[V2] VECT: Support loop len control on EXTRACT_LAST vectorization

Message ID 20230810074909.492039-1-juzhe.zhong@rivai.ai
State Accepted
Headers
Series [V2] VECT: Support loop len control on EXTRACT_LAST vectorization |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai Aug. 10, 2023, 7:49 a.m. UTC
  From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.

This patch adds support for live vectorization by VEC_EXTRACT for LEN loop control.

Consider this following case:

#include <stdint.h>

#define EXTRACT_LAST(TYPE)			\
  TYPE __attribute__ ((noinline, noclone))	\
  test_##TYPE (TYPE *x, int n, TYPE value)	\
  {						\
    TYPE last;					\
    for (int j = 0; j < n; ++j)			\
      {						\
	last = x[j];				\
	x[j] = last * value;			\
      }						\
    return last;				\
  }

#define TEST_ALL(T)				\
  T (uint8_t)					\

TEST_ALL (EXTRACT_LAST)

ARM SVE IR:

Preheader:
  max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });

Loop:
  ...
  # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
  ...
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
  ...
  next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
  ...

Epilogue:
  _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);

For RVV since we prefer len in loop control, after this patch for RVV:

Loop:
  ...
  loop_len_22 = SELECT_VL;
  vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
  vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
  .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
  ...

Epilogue:
  _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);

Details of this approach:

1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
            for LEN loop control.
   
   In this function we check whether the target supports:
    - Using LEN as the loop control.
    - The VEC_EXTRACT optab.

2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.

3. Step 3 - Generate VEC_EXTRACT (v, LEN - 1 - BIAS).

NOTE: This patch sets 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
      assignment whereas VEC_EXTRACT is neither a pure nor a const function according to internal-fn.def:

      DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)

      If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:

        if (need_ssa_update_p (cfun))
          {
            gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
            fun->gimple_df->ssa_renaming_needed = false;
            todo |= TODO_update_ssa_only_virtuals;
          }
      
      I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
        
	- The one is in 'vectorizable_simd_clone_call':

	/* When the original call is pure or const but the SIMD ABI dictates
	 an aggregate return we will have to use a virtual definition and
	 in a loop eventually even need to add a virtual PHI.  That's
	 not straight-forward so allow to fix this up via renaming.  */
      if (gimple_call_lhs (stmt)
	  && !gimple_vdef (stmt)
	  && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
	vinfo->any_known_not_updated_vssa = true;
       
       - The other is in 'vectorizable_load':
       
        if (memory_access_type == VMAT_LOAD_STORE_LANES)
	  vinfo->any_known_not_updated_vssa = true;

      It seems that they have the same rationale as what I am doing in 'vectorizable_live_operation'.
      Feel free to correct me if I am wrong.

      Bootstrap and Regression on X86 passed.
	
gcc/ChangeLog:

        * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
        (vectorizable_live_operation): Add loop LEN control.

---
 gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 6 deletions(-)
  

Comments

Richard Biener Aug. 10, 2023, 7:58 a.m. UTC | #1
On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE)			\
>   TYPE __attribute__ ((noinline, noclone))	\
>   test_##TYPE (TYPE *x, int n, TYPE value)	\
>   {						\
>     TYPE last;					\
>     for (int j = 0; j < n; ++j)			\
>       {						\
> 	last = x[j];				\
> 	x[j] = last * value;			\
>       }						\
>     return last;				\
>   }
> 
> #define TEST_ALL(T)				\
>   T (uint8_t)					\
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>             for LEN loop control.
>    
>    This function we check whether target support:
>     - Use LEN as the loop control.
>     - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN - 1 - BIAS).
> 
> NOTE: This patch set 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
>       assignment wheras VEC_EXTRACT is neither pure nor const function according to internal-fn.def:
> 
>       DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> 
>       If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:
> 
>         if (need_ssa_update_p (cfun))
>           {
>             gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
>             fun->gimple_df->ssa_renaming_needed = false;
>             todo |= TODO_update_ssa_only_virtuals;
>           }
>       
>       I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
>         
> 	- The one is in 'vectorizable_simd_clone_call':
> 
> 	/* When the original call is pure or const but the SIMD ABI dictates
> 	 an aggregate return we will have to use a virtual definition and
> 	 in a loop eventually even need to add a virtual PHI.  That's
> 	 not straight-forward so allow to fix this up via renaming.  */
>       if (gimple_call_lhs (stmt)
> 	  && !gimple_vdef (stmt)
> 	  && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
> 	vinfo->any_known_not_updated_vssa = true;
>        
>        - The other is in 'vectorizable_load':
>        
>         if (memory_access_type == VMAT_LOAD_STORE_LANES)
> 	  vinfo->any_known_not_updated_vssa = true;
> 
>       It seems that they are the same reason as me doing in 'vectorizable_live_operation'.
>       Feel free to correct me if I am wrong.

You should always manually update things.  Did you verify the mask
case is handled by this?

There's the odd

      if (stmts)
        {
          gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

          /* Remove existing phi from lhs and create one copy from 
new_tree.  */
          tree lhs_phi = NULL_TREE;
          gimple_stmt_iterator gsi;
          for (gsi = gsi_start_phis (exit_bb);
               !gsi_end_p (gsi); gsi_next (&gsi))
            {
              gimple *phi = gsi_stmt (gsi);
              if ((gimple_phi_arg_def (phi, 0) == lhs))
                {
                  remove_phi_node (&gsi, false);
                  lhs_phi = gimple_phi_result (phi);
                  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
                  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
                  break;
                }
            }

code but I don't think it will create new LC PHIs for the mask, instead
it will break LC SSA as well by removing a PHI?

I guess as a temporary thing your approach is OK but we shouldn't
add these as part of new code - it's supposed to handle legacy
cases that we didn't fixup yet.

Richard.



>       Bootstrap and Regression on X86 passed.
> 	
> gcc/ChangeLog:
> 
>         * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
>         (vectorizable_live_operation): Add loop LEN control.
> 
> ---
>  gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 68 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 00058c3c13e..208918f53fb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8964,6 +8964,24 @@ vect_can_vectorize_without_simd_p (code_helper code)
>  	  && vect_can_vectorize_without_simd_p (tree_code (code)));
>  }
>  
> +/* Return true if target supports extract last vectorization with LEN.  */
> +
> +static bool
> +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> +{
> +  /* Return false if target doesn't support LEN in loop control.  */
> +  machine_mode vmode;
> +  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
> +      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
> +    return false;
> +
> +  /* Target need to support VEC_EXTRACT to extract the last active element.  */
> +  return convert_optab_handler (vec_extract_optab,
> +				TYPE_MODE (vectype),
> +				TYPE_MODE (TREE_TYPE (vectype)))
> +	 != CODE_FOR_nothing;
> +}
> +
>  /* Create vector init for vectorized iv.  */
>  static tree
>  vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> @@ -10282,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo,
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  	{
>  	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -					       OPTIMIZE_FOR_SPEED))
> +					       OPTIMIZE_FOR_SPEED)
> +	      && !vect_can_vectorize_extract_last_with_len_p (vectype))
>  	    {
>  	      if (dump_enabled_p ())
>  		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10311,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo,
>  	  else
>  	    {
>  	      gcc_assert (ncopies == 1 && !slp_node);
> -	      vect_record_loop_mask (loop_vinfo,
> -				     &LOOP_VINFO_MASKS (loop_vinfo),
> -				     1, vectype, NULL);
> +	      if (vect_can_vectorize_extract_last_with_len_p (vectype))
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo),
> +				      1, vectype, 1);
> +	      else
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo),
> +				       1, vectype, NULL);
>  	    }
>  	}
>        /* ???  Enable for loop costing as well.  */
> @@ -10339,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10383,7 +10409,43 @@ vectorizable_live_operation (vec_info *vinfo,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +	{
> +	  /* Emit:
> +
> +	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
> +
> +	     where VEC_LHS is the vectorized live-out result and MASK is
> +	     the loop mask for the final iteration.  */
> +	  gcc_assert (ncopies == 1 && !slp_node);
> +	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> +	  tree len
> +	    = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
> +				 1, vectype, 0, 0);
> +
> +	  /* BIAS + 1.  */
> +	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	  tree bias_one
> +	    = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
> +			  build_one_cst (TREE_TYPE (len)));
> +
> +	  /* LAST_INDEX = LEN - (BIAS + 1).  */
> +	  tree last_index
> +	    = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
> +
> +	  tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
> +					  vec_lhs_phi, last_index);
> +
> +	  /* Convert the extracted vector element to the scalar type.  */
> +	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +	  /* When the original stmt is an assignment but VEC_EXTRACT is not pure
> +	     or const since it may return a memory result.  We will have to use
> +	     a virtual definition and in a loop eventually even need to add a
> +	     virtual PHI. That's not straight-forward so allow to fix this up
> +	     via renaming.  */
> +	  vinfo->any_known_not_updated_vssa = true;
> +	}
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	{
>  	  /* Emit:
>  
>
  
juzhe.zhong@rivai.ai Aug. 10, 2023, 8:15 a.m. UTC | #2
Hi, Richi.  Thanks so much for the review.


>> You should always manually update things.  Did you verify the mask
>>case is handled by this?
When we enable LEN as loop control. The only mask case will be the condition mask case.

Consider this following case:

int __attribute__ ((noinline, noclone))
condition_reduction (int *a, int min_v, int n)
{
  int last = 66; /* High start value.  */

  for (int i = 0; i < n; i++)
    if (a[i] < min_v)
      last = i;

  return last;
}

It will be handled in vectorizable_condition
by FOLD_EXTRACT_LAST (That's why I have said we will need to add LEN_FOLD_EXTRACT_LAST).

I have tried many cases, and it turns out that vectorizable_live_operation only handles either loop LEN control or loop MASK control.
And I found that all the mask cases under loop LEN control will be handled by vectorizable_condition (with LEN_FOLD_EXTRACT_LAST in my downstream).

IMHO, no mask case reaches 'vectorizable_live_operation' when we are using LEN as the loop control.

>> code but I don't think it will create new LC PHIs for the mask, instead
>> it will break LC SSA as well by removing a PHI?
I have tried several debug cases on ARM SVE, and it turns out that the LC SSA is created by:
'rewrite_into_loop_closed_ssa'


Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-10 15:58
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE) \
>   TYPE __attribute__ ((noinline, noclone)) \
>   test_##TYPE (TYPE *x, int n, TYPE value) \
>   { \
>     TYPE last; \
>     for (int j = 0; j < n; ++j) \
>       { \
> last = x[j]; \
> x[j] = last * value; \
>       } \
>     return last; \
>   }
> 
> #define TEST_ALL(T) \
>   T (uint8_t) \
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>             for LEN loop control.
>    
>    This function we check whether target support:
>     - Use LEN as the loop control.
>     - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN - 1 - BIAS).
> 
> NOTE: This patch set 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
>       assignment wheras VEC_EXTRACT is neither pure nor const function according to internal-fn.def:
> 
>       DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> 
>       If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:
> 
>         if (need_ssa_update_p (cfun))
>           {
>             gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
>             fun->gimple_df->ssa_renaming_needed = false;
>             todo |= TODO_update_ssa_only_virtuals;
>           }
>       
>       I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
>         
> - The one is in 'vectorizable_simd_clone_call':
> 
> /* When the original call is pure or const but the SIMD ABI dictates
> an aggregate return we will have to use a virtual definition and
> in a loop eventually even need to add a virtual PHI.  That's
> not straight-forward so allow to fix this up via renaming.  */
>       if (gimple_call_lhs (stmt)
>   && !gimple_vdef (stmt)
>   && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
> vinfo->any_known_not_updated_vssa = true;
>        
>        - The other is in 'vectorizable_load':
>        
>         if (memory_access_type == VMAT_LOAD_STORE_LANES)
>   vinfo->any_known_not_updated_vssa = true;
> 
>       It seems that they are the same reason as me doing in 'vectorizable_live_operation'.
>       Feel free to correct me if I am wrong.
 
You should always manually update things.  Did you verify the mask
case is handled by this?
 
There's the odd
 
      if (stmts)
        {
          gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
          /* Remove existing phi from lhs and create one copy from 
new_tree.  */
          tree lhs_phi = NULL_TREE;
          gimple_stmt_iterator gsi;
          for (gsi = gsi_start_phis (exit_bb);
               !gsi_end_p (gsi); gsi_next (&gsi))
            {
              gimple *phi = gsi_stmt (gsi);
              if ((gimple_phi_arg_def (phi, 0) == lhs))
                {
                  remove_phi_node (&gsi, false);
                  lhs_phi = gimple_phi_result (phi);
                  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
                  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
                  break;
                }
            }
 
code but I don't think it will create new LC PHIs for the mask, instead
it will break LC SSA as well by removing a PHI?
 
I guess as a temporary thing your approach is OK but we shouldn't
add these as part of new code - it's supposed to handle legacy
cases that we didn't fixup yet.
 
Richard.
 
 
 
>       Bootstrap and Regression on X86 passed.
> 
> gcc/ChangeLog:
> 
>         * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
>         (vectorizable_live_operation): Add loop LEN control.
> 
> ---
>  gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 68 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 00058c3c13e..208918f53fb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8964,6 +8964,24 @@ vect_can_vectorize_without_simd_p (code_helper code)
>    && vect_can_vectorize_without_simd_p (tree_code (code)));
>  }
>  
> +/* Return true if target supports extract last vectorization with LEN.  */
> +
> +static bool
> +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> +{
> +  /* Return false if target doesn't support LEN in loop control.  */
> +  machine_mode vmode;
> +  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
> +      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
> +    return false;
> +
> +  /* Target need to support VEC_EXTRACT to extract the last active element.  */
> +  return convert_optab_handler (vec_extract_optab,
> + TYPE_MODE (vectype),
> + TYPE_MODE (TREE_TYPE (vectype)))
> + != CODE_FOR_nothing;
> +}
> +
>  /* Create vector init for vectorized iv.  */
>  static tree
>  vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> @@ -10282,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo,
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  {
>    if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -        OPTIMIZE_FOR_SPEED))
> +        OPTIMIZE_FOR_SPEED)
> +       && !vect_can_vectorize_extract_last_with_len_p (vectype))
>      {
>        if (dump_enabled_p ())
>  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10311,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo,
>    else
>      {
>        gcc_assert (ncopies == 1 && !slp_node);
> -       vect_record_loop_mask (loop_vinfo,
> -      &LOOP_VINFO_MASKS (loop_vinfo),
> -      1, vectype, NULL);
> +       if (vect_can_vectorize_extract_last_with_len_p (vectype))
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo),
> +       1, vectype, 1);
> +       else
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo),
> +        1, vectype, NULL);
>      }
>  }
>        /* ???  Enable for loop costing as well.  */
> @@ -10339,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10383,7 +10409,43 @@ vectorizable_live_operation (vec_info *vinfo,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> +   /* Emit:
> +
> +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +   gcc_assert (ncopies == 1 && !slp_node);
> +   tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> +   tree len
> +     = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> +   /* BIAS + 1.  */
> +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +   tree bias_one
> +     = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
> +   build_one_cst (TREE_TYPE (len)));
> +
> +   /* LAST_INDEX = LEN - (BIAS + 1).  */
> +   tree last_index
> +     = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
> +
> +   tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
> +   vec_lhs_phi, last_index);
> +
> +   /* Convert the extracted vector element to the scalar type.  */
> +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +   /* When the original stmt is an assignment but VEC_EXTRACT is not pure
> +      or const since it may return a memory result.  We will have to use
> +      a virtual definition and in a loop eventually even need to add a
> +      virtual PHI. That's not straight-forward so allow to fix this up
> +      via renaming.  */
> +   vinfo->any_known_not_updated_vssa = true;
> + }
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  {
>    /* Emit:
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
  
juzhe.zhong@rivai.ai Aug. 10, 2023, 8:35 a.m. UTC | #3
>> I guess as a temporary thing your approach is OK but we shouldn't
>> add these as part of new code - it's supposed to handle legacy
>> cases that we didn't fixup yet.

Do you mean we need to fix the LC SSA PHI flow so that we don't need to
set vinfo->any_known_not_updated_vssa = true?

After that is fixed, is this patch OK for trunk with 'vinfo->any_known_not_updated_vssa = true' removed?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-10 15:58
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> 
> Consider this following case:
> 
> #include <stdint.h>
> 
> #define EXTRACT_LAST(TYPE) \
>   TYPE __attribute__ ((noinline, noclone)) \
>   test_##TYPE (TYPE *x, int n, TYPE value) \
>   { \
>     TYPE last; \
>     for (int j = 0; j < n; ++j) \
>       { \
> last = x[j]; \
> x[j] = last * value; \
>       } \
>     return last; \
>   }
> 
> #define TEST_ALL(T) \
>   T (uint8_t) \
> 
> TEST_ALL (EXTRACT_LAST)
> 
> ARM SVE IR:
> 
> Preheader:
>   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> 
> Loop:
>   ...
>   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
>   ...
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
>   ...
>   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>   ...
> 
> Epilogue:
>   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> 
> For RVV since we prefer len in loop control, after this patch for RVV:
> 
> Loop:
>   ...
>   loop_len_22 = SELECT_VL;
>   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
>   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
>   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
>   ...
> 
> Epilogue:
>   _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);
> 
> Details of this approach:
> 
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
>             for LEN loop control.
>    
>    This function we check whether target support:
>     - Use LEN as the loop control.
>     - Support VEC_EXTRACT optab.
> 
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> 
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN - 1 - BIAS).
> 
> NOTE: This patch set 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
>       assignment wheras VEC_EXTRACT is neither pure nor const function according to internal-fn.def:
> 
>       DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> 
>       If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:
> 
>         if (need_ssa_update_p (cfun))
>           {
>             gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
>             fun->gimple_df->ssa_renaming_needed = false;
>             todo |= TODO_update_ssa_only_virtuals;
>           }
>       
>       I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
>         
> - The one is in 'vectorizable_simd_clone_call':
> 
> /* When the original call is pure or const but the SIMD ABI dictates
> an aggregate return we will have to use a virtual definition and
> in a loop eventually even need to add a virtual PHI.  That's
> not straight-forward so allow to fix this up via renaming.  */
>       if (gimple_call_lhs (stmt)
>   && !gimple_vdef (stmt)
>   && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
> vinfo->any_known_not_updated_vssa = true;
>        
>        - The other is in 'vectorizable_load':
>        
>         if (memory_access_type == VMAT_LOAD_STORE_LANES)
>   vinfo->any_known_not_updated_vssa = true;
> 
>       It seems that they are the same reason as me doing in 'vectorizable_live_operation'.
>       Feel free to correct me if I am wrong.
 
You should always manually update things.  Did you verify the mask
case is handled by this?
 
There's the odd
 
      if (stmts)
        {
          gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
          /* Remove existing phi from lhs and create one copy from 
new_tree.  */
          tree lhs_phi = NULL_TREE;
          gimple_stmt_iterator gsi;
          for (gsi = gsi_start_phis (exit_bb);
               !gsi_end_p (gsi); gsi_next (&gsi))
            {
              gimple *phi = gsi_stmt (gsi);
              if ((gimple_phi_arg_def (phi, 0) == lhs))
                {
                  remove_phi_node (&gsi, false);
                  lhs_phi = gimple_phi_result (phi);
                  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
                  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
                  break;
                }
            }
 
code but I don't think it will create new LC PHIs for the mask, instead
it will break LC SSA as well by removing a PHI?
 
I guess as a temporary thing your approach is OK but we shouldn't
add these as part of new code - it's supposed to handle legacy
cases that we didn't fixup yet.
 
Richard.
 
 
 
>       Bootstrap and Regression on X86 passed.
> 
> gcc/ChangeLog:
> 
>         * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
>         (vectorizable_live_operation): Add loop LEN control.
> 
> ---
>  gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 68 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 00058c3c13e..208918f53fb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8964,6 +8964,24 @@ vect_can_vectorize_without_simd_p (code_helper code)
>    && vect_can_vectorize_without_simd_p (tree_code (code)));
>  }
>  
> +/* Return true if target supports extract last vectorization with LEN.  */
> +
> +static bool
> +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> +{
> +  /* Return false if target doesn't support LEN in loop control.  */
> +  machine_mode vmode;
> +  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
> +      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
> +    return false;
> +
> +  /* Target need to support VEC_EXTRACT to extract the last active element.  */
> +  return convert_optab_handler (vec_extract_optab,
> + TYPE_MODE (vectype),
> + TYPE_MODE (TREE_TYPE (vectype)))
> + != CODE_FOR_nothing;
> +}
> +
>  /* Create vector init for vectorized iv.  */
>  static tree
>  vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> @@ -10282,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo,
>        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  {
>    if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> -        OPTIMIZE_FOR_SPEED))
> +        OPTIMIZE_FOR_SPEED)
> +       && !vect_can_vectorize_extract_last_with_len_p (vectype))
>      {
>        if (dump_enabled_p ())
>  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10311,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo,
>    else
>      {
>        gcc_assert (ncopies == 1 && !slp_node);
> -       vect_record_loop_mask (loop_vinfo,
> -      &LOOP_VINFO_MASKS (loop_vinfo),
> -      1, vectype, NULL);
> +       if (vect_can_vectorize_extract_last_with_len_p (vectype))
> + vect_record_loop_len (loop_vinfo,
> +       &LOOP_VINFO_LENS (loop_vinfo),
> +       1, vectype, 1);
> +       else
> + vect_record_loop_mask (loop_vinfo,
> +        &LOOP_VINFO_MASKS (loop_vinfo),
> +        1, vectype, NULL);
>      }
>  }
>        /* ???  Enable for loop costing as well.  */
> @@ -10339,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo,
>    gimple *vec_stmt;
>    if (slp_node)
>      {
> -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> +      gcc_assert (!loop_vinfo
> +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>  
>        /* Get the correct slp vectorized stmt.  */
>        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10383,7 +10409,43 @@ vectorizable_live_operation (vec_info *vinfo,
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> +   /* Emit:
> +
> +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +   gcc_assert (ncopies == 1 && !slp_node);
> +   tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> +   tree len
> +     = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> +   /* BIAS + 1.  */
> +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +   tree bias_one
> +     = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
> +   build_one_cst (TREE_TYPE (len)));
> +
> +   /* LAST_INDEX = LEN - (BIAS + 1).  */
> +   tree last_index
> +     = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
> +
> +   tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
> +   vec_lhs_phi, last_index);
> +
> +   /* Convert the extracted vector element to the scalar type.  */
> +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +   /* When the original stmt is an assignment but VEC_EXTRACT is not pure
> +      or const since it may return a memory result.  We will have to use
> +      a virtual definition and in a loop eventually even need to add a
> +      virtual PHI. That's not straight-forward so allow to fix this up
> +      via renaming.  */
> +   vinfo->any_known_not_updated_vssa = true;
> + }
> +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  {
>    /* Emit:
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
  
Richard Biener Aug. 10, 2023, 11:09 a.m. UTC | #4
On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:

> >> I guess as a temporary thing your approach is OK but we shouldn't
> >> add these as part of new code - it's supposed to handle legacy
> >> cases that we didn't fixup yet.
> 
> Do you mean we need to fix LC SSA PHI flow so that we don't need to 
> set vinfo->any_known_not_updated_vssa = true ?
> 
> After it's fixed then this patch with removing 'vinfo->any_known_not_updated_vssa = true' is ok for trunk, am I right?

I want to know why we don't need this for SVE fully masked loops.  What
inserts the required LC SSA PHI in that case?

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-10 15:58
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, Richard and Richi.
> > 
> > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> > 
> > Consider this following case:
> > 
> > #include <stdint.h>
> > 
> > #define EXTRACT_LAST(TYPE) \
> >   TYPE __attribute__ ((noinline, noclone)) \
> >   test_##TYPE (TYPE *x, int n, TYPE value) \
> >   { \
> >     TYPE last; \
> >     for (int j = 0; j < n; ++j) \
> >       { \
> > last = x[j]; \
> > x[j] = last * value; \
> >       } \
> >     return last; \
> >   }
> > 
> > #define TEST_ALL(T) \
> >   T (uint8_t) \
> > 
> > TEST_ALL (EXTRACT_LAST)
> > 
> > ARM SVE IR:
> > 
> > Preheader:
> >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > 
> > Loop:
> >   ...
> >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> >   ...
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> >   ...
> >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> >   ...
> > 
> > Epilogue:
> >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > 
> > For RVV since we prefer len in loop control, after this patch for RVV:
> > 
> > Loop:
> >   ...
> >   loop_len_22 = SELECT_VL;
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> >   ...
> > 
> > Epilogue:
> >   _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);
> > 
> > Details of this approach:
> > 
> > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
> >             for LEN loop control.
> >    
> >    This function we check whether target support:
> >     - Use LEN as the loop control.
> >     - Support VEC_EXTRACT optab.
> > 
> > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> > 
> > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN - 1 - BIAS).
> > 
> > NOTE: This patch set 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
> >       assignment wheras VEC_EXTRACT is neither pure nor const function according to internal-fn.def:
> > 
> >       DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> > 
> >       If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:
> > 
> >         if (need_ssa_update_p (cfun))
> >           {
> >             gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
> >             fun->gimple_df->ssa_renaming_needed = false;
> >             todo |= TODO_update_ssa_only_virtuals;
> >           }
> >       
> >       I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
> >         
> > - The one is in 'vectorizable_simd_clone_call':
> > 
> > /* When the original call is pure or const but the SIMD ABI dictates
> > an aggregate return we will have to use a virtual definition and
> > in a loop eventually even need to add a virtual PHI.  That's
> > not straight-forward so allow to fix this up via renaming.  */
> >       if (gimple_call_lhs (stmt)
> >   && !gimple_vdef (stmt)
> >   && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
> > vinfo->any_known_not_updated_vssa = true;
> >        
> >        - The other is in 'vectorizable_load':
> >        
> >         if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >   vinfo->any_known_not_updated_vssa = true;
> > 
> >       It seems that they are the same reason as me doing in 'vectorizable_live_operation'.
> >       Feel free to correct me if I am wrong.
>  
> You should always manually update things.  Did you verify the mask
> case is handled by this?
>  
> There's the odd
>  
>       if (stmts)
>         {
>           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
>           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>  
>           /* Remove existing phi from lhs and create one copy from 
> new_tree.  */
>           tree lhs_phi = NULL_TREE;
>           gimple_stmt_iterator gsi;
>           for (gsi = gsi_start_phis (exit_bb);
>                !gsi_end_p (gsi); gsi_next (&gsi))
>             {
>               gimple *phi = gsi_stmt (gsi);
>               if ((gimple_phi_arg_def (phi, 0) == lhs))
>                 {
>                   remove_phi_node (&gsi, false);
>                   lhs_phi = gimple_phi_result (phi);
>                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
>                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
>                   break;
>                 }
>             }
>  
> code but I don't think it will create new LC PHIs for the mask, instead
> it will break LC SSA as well by removing a PHI?
>  
> I guess as a temporary thing your approach is OK but we shouldn't
> add these as part of new code - it's supposed to handle legacy
> cases that we didn't fixup yet.
>  
> Richard.
>  
>  
>  
> >       Bootstrap and Regression on X86 passed.
> > 
> > gcc/ChangeLog:
> > 
> >         * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> >         (vectorizable_live_operation): Add loop LEN control.
> > 
> > ---
> >  gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 68 insertions(+), 6 deletions(-)
> > 
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 00058c3c13e..208918f53fb 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -8964,6 +8964,24 @@ vect_can_vectorize_without_simd_p (code_helper code)
> >    && vect_can_vectorize_without_simd_p (tree_code (code)));
> >  }
> >  
> > +/* Return true if target supports extract last vectorization with LEN.  */
> > +
> > +static bool
> > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > +{
> > +  /* Return false if target doesn't support LEN in loop control.  */
> > +  machine_mode vmode;
> > +  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
> > +      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
> > +    return false;
> > +
> > +  /* Target need to support VEC_EXTRACT to extract the last active element.  */
> > +  return convert_optab_handler (vec_extract_optab,
> > + TYPE_MODE (vectype),
> > + TYPE_MODE (TREE_TYPE (vectype)))
> > + != CODE_FOR_nothing;
> > +}
> > +
> >  /* Create vector init for vectorized iv.  */
> >  static tree
> >  vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > @@ -10282,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo,
> >        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> >  {
> >    if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > -        OPTIMIZE_FOR_SPEED))
> > +        OPTIMIZE_FOR_SPEED)
> > +       && !vect_can_vectorize_extract_last_with_len_p (vectype))
> >      {
> >        if (dump_enabled_p ())
> >  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -10311,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo,
> >    else
> >      {
> >        gcc_assert (ncopies == 1 && !slp_node);
> > -       vect_record_loop_mask (loop_vinfo,
> > -      &LOOP_VINFO_MASKS (loop_vinfo),
> > -      1, vectype, NULL);
> > +       if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > + vect_record_loop_len (loop_vinfo,
> > +       &LOOP_VINFO_LENS (loop_vinfo),
> > +       1, vectype, 1);
> > +       else
> > + vect_record_loop_mask (loop_vinfo,
> > +        &LOOP_VINFO_MASKS (loop_vinfo),
> > +        1, vectype, NULL);
> >      }
> >  }
> >        /* ???  Enable for loop costing as well.  */
> > @@ -10339,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo,
> >    gimple *vec_stmt;
> >    if (slp_node)
> >      {
> > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > +      gcc_assert (!loop_vinfo
> > +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> >  
> >        /* Get the correct slp vectorized stmt.  */
> >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > @@ -10383,7 +10409,43 @@ vectorizable_live_operation (vec_info *vinfo,
> >  
> >        gimple_seq stmts = NULL;
> >        tree new_tree;
> > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > + {
> > +   /* Emit:
> > +
> > +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
> > +
> > +      where VEC_LHS is the vectorized live-out result and MASK is
> > +      the loop mask for the final iteration.  */
> > +   gcc_assert (ncopies == 1 && !slp_node);
> > +   tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > +   tree len
> > +     = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
> > + 1, vectype, 0, 0);
> > +
> > +   /* BIAS + 1.  */
> > +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +   tree bias_one
> > +     = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
> > +   build_one_cst (TREE_TYPE (len)));
> > +
> > +   /* LAST_INDEX = LEN - (BIAS + 1).  */
> > +   tree last_index
> > +     = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
> > +
> > +   tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
> > +   vec_lhs_phi, last_index);
> > +
> > +   /* Convert the extracted vector element to the scalar type.  */
> > +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > +   /* When the original stmt is an assignment but VEC_EXTRACT is not pure
> > +      or const since it may return a memory result.  We will have to use
> > +      a virtual definition and in a loop eventually even need to add a
> > +      virtual PHI. That's not straight-forward so allow to fix this up
> > +      via renaming.  */
> > +   vinfo->any_known_not_updated_vssa = true;
> > + }
> > +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> >  {
> >    /* Emit:
> >  
> > 
>  
>
  
juzhe.zhong@rivai.ai Aug. 10, 2023, 11:38 a.m. UTC | #5
Hi, Richi.

>> What inserts the required LC SSA PHI in that case?

Here is the flow of how GCC inserts the LC SSA PHI for ARM SVE.
You can see this following 'vect' dump details:
https://godbolt.org/z/564o87oz3 

You can see this following information:

;; Created LCSSA PHI: loop_mask_36 = PHI <loop_mask_22(3)>
  # loop_mask_36 = PHI <loop_mask_22(3)>
  _25 = .EXTRACT_LAST (loop_mask_36, vect_last_12.8_24);
  last_17 = _25;

The '# loop_mask_36 = PHI <loop_mask_22(3)>' is inserted as follows:

Step 1 - Enter file tree-vectorizer.cc
In the function pass_vectorize::execute (function *fun): 1358
'rewrite_into_loop_closed_ssa' is the key function that inserts the LC SSA PHI for ARM SVE in the masked-loop case.

Step 2 - Investigate more into 'rewrite_into_loop_closed_ssa':
In file tree-ssa-loop-manip.cc:628, 'rewrite_into_loop_closed_ssa' is directly calling 'rewrite_into_loop_closed_ssa_1'.
Step 3 - Investigate 'rewrite_into_loop_closed_ssa_1':
In file tree-ssa-loop-manip.cc:588 which is the function 'find_uses_to_rename' that:
/* Marks names matching USE_FLAGS that are used outside of the loop they are
   defined in for rewrite.  Records the set of blocks in which the ssa names are
   used to USE_BLOCKS.  Record the SSA names that will need exit PHIs in
   NEED_PHIS.  If CHANGED_BBS is not NULL, scan only blocks in this set.  */

static void
find_uses_to_rename (bitmap changed_bbs, bitmap *use_blocks, bitmap need_phis,
         int use_flags)
{
  basic_block bb;
  unsigned index;
  bitmap_iterator bi;

  if (changed_bbs)
    EXECUTE_IF_SET_IN_BITMAP (changed_bbs, 0, index, bi)
      {
  bb = BASIC_BLOCK_FOR_FN (cfun, index);
  if (bb)
    find_uses_to_rename_bb (bb, use_blocks, need_phis, use_flags);
      }
  else
    FOR_EACH_BB_FN (bb, cfun)
      find_uses_to_rename_bb (bb, use_blocks, need_phis, use_flags);
}

This function iterates over all blocks of the function to set the bitmap of SSA names that need to be renamed; a later function will then insert the LC SSA PHIs for them.

In file tree-ssa-loop-manip.cc:606 is the function 'add_exit_phis', which is the function that actually adds the LC SSA PHIs; it eventually calls this:
/* Add a loop-closing PHI for VAR in basic block EXIT.  */

static void
add_exit_phi (basic_block exit, tree var)
{
  gphi *phi;
  edge e;
  edge_iterator ei;

  /* Check that at least one of the edges entering the EXIT block exits
     the loop, or a superloop of that loop, that VAR is defined in.  */
  if (flag_checking)
    {
      gimple *def_stmt = SSA_NAME_DEF_STMT (var);
      basic_block def_bb = gimple_bb (def_stmt);
      FOR_EACH_EDGE (e, ei, exit->preds)
  {
    class loop *aloop = find_common_loop (def_bb->loop_father,
             e->src->loop_father);
    if (!flow_bb_inside_loop_p (aloop, e->dest))
      break;
  }
      gcc_assert (e);
    }

  phi = create_phi_node (NULL_TREE, exit);
  create_new_def_for (var, phi, gimple_phi_result_ptr (phi));
  FOR_EACH_EDGE (e, ei, exit->preds)
    add_phi_arg (phi, var, e, UNKNOWN_LOCATION);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, ";; Created LCSSA PHI: ");
      print_gimple_stmt (dump_file, phi, 0, dump_flags);
    }
}


This is how it works for ARM SVE with EXTRACT_LAST. Such a flow (rewrite_into_loop_closed_ssa) can always insert the LC SSA PHI for RVV as well, which uses a length-controlled loop.

However,

>> I want to know why we don't need this for SVE fully masked loops. 

Before entering 'rewrite_into_loop_closed_ssa', there is a check whose assertion fails for RVV but passes for ARM SVE:

  /* We should not have to update virtual SSA form here but some
     transforms involve creating new virtual definitions which makes
     updating difficult.
     We delay the actual update to the end of the pass but avoid
     confusing ourselves by forcing need_ssa_update_p () to false.  */
  unsigned todo = 0;
  if (need_ssa_update_p (cfun))
    {
      gcc_assert (loop_vinfo->any_known_not_updated_vssa);
      fun->gimple_df->ssa_renaming_needed = false;
      todo |= TODO_update_ssa_only_virtuals;
    }

in tree-vectorizer.cc, function 'vect_transform_loops'
The assertion (gcc_assert (loop_vinfo->any_known_not_updated_vssa);)
failed for RVV since it is false.

The reason ARM SVE can pass is that both the statement before 'vectorizable_live_operation' (STMT1) and the statement produced by its vectorization (STMT2)
are CONST or PURE, since ARM SVE uses EXTRACT_LAST.  Here is the definition of the 'EXTRACT_LAST' internal function:
/* Extract the last active element from a vector.  */
DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
           extract_last, fold_left)

You can see 'EXTRACT_LAST' is ECF_CONST.

Whereas RVV will fail, since it uses 'VEC_EXTRACT', which is not ECF_CONST:
DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)

When I changed VEC_EXTRACT to be ECF_CONST, we no longer needed 'vinfo->any_known_not_updated_vssa = true';
the flow works perfectly and is no different from ARM SVE.

However, I found we can't mark 'VEC_EXTRACT' as ECF_CONST, since some targets use VEC_EXTRACT to extract an element into memory.

So.... I use 'vinfo->any_known_not_updated_vssa = true'

The alternative approach, as Richard said, is to add IFN_EXTRACT_LAST_LEN and mark it ECF_CONST.  That can definitely work,
but such a pattern is redundant, since we can reuse the existing 'VEC_EXTRACT' pattern, which suits our needs.

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-10 19:09
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford
Subject: Re: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> >> I guess as a temporary thing your approach is OK but we shouldn't
> >> add these as part of new code - it's supposed to handle legacy
> >> cases that we didn't fixup yet.
> 
> Do you mean we need to fix LC SSA PHI flow so that we don't need to 
> set vinfo->any_known_not_updated_vssa = true ?
> 
> After it's fixed then this patch with removing 'vinfo->any_known_not_updated_vssa = true' is ok for trunk, am I right?
 
I want to know why we don't need this for SVE fully masked loops.  What
inserts the required LC SSA PHI in that case?
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-10 15:58
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > 
> > Hi, Richard and Richi.
> > 
> > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> > 
> > Consider this following case:
> > 
> > #include <stdint.h>
> > 
> > #define EXTRACT_LAST(TYPE) \
> >   TYPE __attribute__ ((noinline, noclone)) \
> >   test_##TYPE (TYPE *x, int n, TYPE value) \
> >   { \
> >     TYPE last; \
> >     for (int j = 0; j < n; ++j) \
> >       { \
> > last = x[j]; \
> > x[j] = last * value; \
> >       } \
> >     return last; \
> >   }
> > 
> > #define TEST_ALL(T) \
> >   T (uint8_t) \
> > 
> > TEST_ALL (EXTRACT_LAST)
> > 
> > ARM SVE IR:
> > 
> > Preheader:
> >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > 
> > Loop:
> >   ...
> >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> >   ...
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> >   ...
> >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> >   ...
> > 
> > Epilogue:
> >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > 
> > For RVV since we prefer len in loop control, after this patch for RVV:
> > 
> > Loop:
> >   ...
> >   loop_len_22 = SELECT_VL;
> >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> >   ...
> > 
> > Epilogue:
> >   _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);
> > 
> > Details of this approach:
> > 
> > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
> >             for LEN loop control.
> >    
> >    This function we check whether target support:
> >     - Use LEN as the loop control.
> >     - Support VEC_EXTRACT optab.
> > 
> > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> > 
> > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN - 1 - BIAS).
> > 
> > NOTE: This patch set 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
> >       assignment wheras VEC_EXTRACT is neither pure nor const function according to internal-fn.def:
> > 
> >       DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> > 
> >       If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:
> > 
> >         if (need_ssa_update_p (cfun))
> >           {
> >             gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
> >             fun->gimple_df->ssa_renaming_needed = false;
> >             todo |= TODO_update_ssa_only_virtuals;
> >           }
> >       
> >       I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
> >         
> > - The one is in 'vectorizable_simd_clone_call':
> > 
> > /* When the original call is pure or const but the SIMD ABI dictates
> > an aggregate return we will have to use a virtual definition and
> > in a loop eventually even need to add a virtual PHI.  That's
> > not straight-forward so allow to fix this up via renaming.  */
> >       if (gimple_call_lhs (stmt)
> >   && !gimple_vdef (stmt)
> >   && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
> > vinfo->any_known_not_updated_vssa = true;
> >        
> >        - The other is in 'vectorizable_load':
> >        
> >         if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >   vinfo->any_known_not_updated_vssa = true;
> > 
> >       It seems that they are the same reason as me doing in 'vectorizable_live_operation'.
> >       Feel free to correct me if I am wrong.
>  
> You should always manually update things.  Did you verify the mask
> case is handled by this?
>  
> There's the odd
>  
>       if (stmts)
>         {
>           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
>           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>  
>           /* Remove existing phi from lhs and create one copy from 
> new_tree.  */
>           tree lhs_phi = NULL_TREE;
>           gimple_stmt_iterator gsi;
>           for (gsi = gsi_start_phis (exit_bb);
>                !gsi_end_p (gsi); gsi_next (&gsi))
>             {
>               gimple *phi = gsi_stmt (gsi);
>               if ((gimple_phi_arg_def (phi, 0) == lhs))
>                 {
>                   remove_phi_node (&gsi, false);
>                   lhs_phi = gimple_phi_result (phi);
>                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
>                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
>                   break;
>                 }
>             }
>  
> code but I don't think it will create new LC PHIs for the mask, instead
> it will break LC SSA as well by removing a PHI?
>  
> I guess as a temporary thing your approach is OK but we shouldn't
> add these as part of new code - it's supposed to handle legacy
> cases that we didn't fixup yet.
>  
> Richard.
>  
>  
>  
> >       Bootstrap and Regression on X86 passed.
> > 
> > gcc/ChangeLog:
> > 
> >         * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> >         (vectorizable_live_operation): Add loop LEN control.
> > 
> > ---
> >  gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 68 insertions(+), 6 deletions(-)
> > 
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 00058c3c13e..208918f53fb 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -8964,6 +8964,24 @@ vect_can_vectorize_without_simd_p (code_helper code)
> >    && vect_can_vectorize_without_simd_p (tree_code (code)));
> >  }
> >  
> > +/* Return true if target supports extract last vectorization with LEN.  */
> > +
> > +static bool
> > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > +{
> > +  /* Return false if target doesn't support LEN in loop control.  */
> > +  machine_mode vmode;
> > +  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
> > +      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
> > +    return false;
> > +
> > +  /* Target need to support VEC_EXTRACT to extract the last active element.  */
> > +  return convert_optab_handler (vec_extract_optab,
> > + TYPE_MODE (vectype),
> > + TYPE_MODE (TREE_TYPE (vectype)))
> > + != CODE_FOR_nothing;
> > +}
> > +
> >  /* Create vector init for vectorized iv.  */
> >  static tree
> >  vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > @@ -10282,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo,
> >        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> >  {
> >    if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > -        OPTIMIZE_FOR_SPEED))
> > +        OPTIMIZE_FOR_SPEED)
> > +       && !vect_can_vectorize_extract_last_with_len_p (vectype))
> >      {
> >        if (dump_enabled_p ())
> >  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -10311,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo,
> >    else
> >      {
> >        gcc_assert (ncopies == 1 && !slp_node);
> > -       vect_record_loop_mask (loop_vinfo,
> > -      &LOOP_VINFO_MASKS (loop_vinfo),
> > -      1, vectype, NULL);
> > +       if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > + vect_record_loop_len (loop_vinfo,
> > +       &LOOP_VINFO_LENS (loop_vinfo),
> > +       1, vectype, 1);
> > +       else
> > + vect_record_loop_mask (loop_vinfo,
> > +        &LOOP_VINFO_MASKS (loop_vinfo),
> > +        1, vectype, NULL);
> >      }
> >  }
> >        /* ???  Enable for loop costing as well.  */
> > @@ -10339,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo,
> >    gimple *vec_stmt;
> >    if (slp_node)
> >      {
> > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > +      gcc_assert (!loop_vinfo
> > +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> >  
> >        /* Get the correct slp vectorized stmt.  */
> >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > @@ -10383,7 +10409,43 @@ vectorizable_live_operation (vec_info *vinfo,
> >  
> >        gimple_seq stmts = NULL;
> >        tree new_tree;
> > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > + {
> > +   /* Emit:
> > +
> > +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
> > +
> > +      where VEC_LHS is the vectorized live-out result and MASK is
> > +      the loop mask for the final iteration.  */
> > +   gcc_assert (ncopies == 1 && !slp_node);
> > +   tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > +   tree len
> > +     = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
> > + 1, vectype, 0, 0);
> > +
> > +   /* BIAS + 1.  */
> > +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +   tree bias_one
> > +     = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
> > +   build_one_cst (TREE_TYPE (len)));
> > +
> > +   /* LAST_INDEX = LEN - (BIAS + 1).  */
> > +   tree last_index
> > +     = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
> > +
> > +   tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
> > +   vec_lhs_phi, last_index);
> > +
> > +   /* Convert the extracted vector element to the scalar type.  */
> > +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > +   /* When the original stmt is an assignment but VEC_EXTRACT is not pure
> > +      or const since it may return a memory result.  We will have to use
> > +      a virtual definition and in a loop eventually even need to add a
> > +      virtual PHI. That's not straight-forward so allow to fix this up
> > +      via renaming.  */
> > +   vinfo->any_known_not_updated_vssa = true;
> > + }
> > +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> >  {
> >    /* Emit:
> >  
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
  
Richard Biener Aug. 10, 2023, 12:14 p.m. UTC | #6
On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:

> Hi, Richi.
> 
> >> What inserts the required LC SSA PHI in that case?
> 
> Here is the flow how GCC insert LC SSA PHI flow for ARM SVE.
> You can see this following 'vect' dump details:
> https://godbolt.org/z/564o87oz3 
> 
> You can see this following information:
> 
> ;; Created LCSSA PHI: loop_mask_36 = PHI <loop_mask_22(3)>
>   # loop_mask_36 = PHI <loop_mask_22(3)>
>   _25 = .EXTRACT_LAST (loop_mask_36, vect_last_12.8_24);
>   last_17 = _25;
> 
> The '# loop_mask_36 = PHI <loop_mask_22(3)>' is inserted as follows:
> 
> Step 1 - Enter file tree-vectorizer.cc
> In the function pass_vectorize::execute (function *fun): 1358
> 'rewrite_into_loop_closed_ssa' is the key function that inserts the LC SSA PHI for ARM SVE in the mask loop case.
> 
> Step 2 - Investigate more into 'rewrite_into_loop_closed_ssa':
> In file tree-ssa-loop-manip.cc:628, 'rewrite_into_loop_closed_ssa' is directly calling 'rewrite_into_loop_closed_ssa_1'.
> Step 3 - Investigate 'rewrite_into_loop_closed_ssa_1':
> In file tree-ssa-loop-manip.cc:588 which is the function 'find_uses_to_rename' that:
> /* Marks names matching USE_FLAGS that are used outside of the loop they are
>    defined in for rewrite.  Records the set of blocks in which the ssa names are
>    used to USE_BLOCKS.  Record the SSA names that will need exit PHIs in
>    NEED_PHIS.  If CHANGED_BBS is not NULL, scan only blocks in this set.  */
> 
> static void
> find_uses_to_rename (bitmap changed_bbs, bitmap *use_blocks, bitmap need_phis,
>          int use_flags)
> {
>   basic_block bb;
>   unsigned index;
>   bitmap_iterator bi;
> 
>   if (changed_bbs)
>     EXECUTE_IF_SET_IN_BITMAP (changed_bbs, 0, index, bi)
>       {
>   bb = BASIC_BLOCK_FOR_FN (cfun, index);
>   if (bb)
>     find_uses_to_rename_bb (bb, use_blocks, need_phis, use_flags);
>       }
>   else
>     FOR_EACH_BB_FN (bb, cfun)
>       find_uses_to_rename_bb (bb, use_blocks, need_phis, use_flags);
> }
> 
> This function is iterating all blocks of the function to set the BITMAP which SSA need to be renamed then the later function will insert LC SSA for it.
> 
> In file tree-ssa-loop-manip.cc:606 which is the function 'add_exit_phis' that is the real function that is adding LC SSA by calling
> this eventually:
> /* Add a loop-closing PHI for VAR in basic block EXIT.  */
> 
> static void
> add_exit_phi (basic_block exit, tree var)
> {
>   gphi *phi;
>   edge e;
>   edge_iterator ei;
> 
>   /* Check that at least one of the edges entering the EXIT block exits
>      the loop, or a superloop of that loop, that VAR is defined in.  */
>   if (flag_checking)
>     {
>       gimple *def_stmt = SSA_NAME_DEF_STMT (var);
>       basic_block def_bb = gimple_bb (def_stmt);
>       FOR_EACH_EDGE (e, ei, exit->preds)
>   {
>     class loop *aloop = find_common_loop (def_bb->loop_father,
>              e->src->loop_father);
>     if (!flow_bb_inside_loop_p (aloop, e->dest))
>       break;
>   }
>       gcc_assert (e);
>     }
> 
>   phi = create_phi_node (NULL_TREE, exit);
>   create_new_def_for (var, phi, gimple_phi_result_ptr (phi));
>   FOR_EACH_EDGE (e, ei, exit->preds)
>     add_phi_arg (phi, var, e, UNKNOWN_LOCATION);
> 
>   if (dump_file && (dump_flags & TDF_DETAILS))
>     {
>       fprintf (dump_file, ";; Created LCSSA PHI: ");
>       print_gimple_stmt (dump_file, phi, 0, dump_flags);
>     }
> }
> 
> 
> This is how it works for ARM SVE in EXTRACT_LAST. Such flow (rewrite_into_loop_closed_ssa) can always insert LC SSA for RVV which is using length loop.
> 
> However,
> 
> >> I want to know why we don't need this for SVE fully masked loops. 
> 
> Before entering 'rewrite_into_loop_closed_ssa', there is a check here that RVV assertion failed but ARM SVE passed:
> 
>   /* We should not have to update virtual SSA form here but some
>      transforms involve creating new virtual definitions which makes
>      updating difficult.
>      We delay the actual update to the end of the pass but avoid
>      confusing ourselves by forcing need_ssa_update_p () to false.  */
>   unsigned todo = 0;
>   if (need_ssa_update_p (cfun))
>     {
>       gcc_assert (loop_vinfo->any_known_not_updated_vssa);
>       fun->gimple_df->ssa_renaming_needed = false;
>       todo |= TODO_update_ssa_only_virtuals;
>     }
> 
> in tree-vectorizer.cc, function 'vect_transform_loops'
> The assertion (gcc_assert (loop_vinfo->any_known_not_updated_vssa);)
> failed for RVV since it is false.
> 
> The reason why ARM SVE can pass is that the STMT1 before 'vectorizable_live_operation' and STMT2 after vectorization of 'vectorizable_live_operation'
> are both CONST or PURE since ARM SVE is using EXTRACT_LAST, here is the define of 'EXTRACT_LAST' internal function:
> /* Extract the last active element from a vector.  */
> DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
>            extract_last, fold_left)
> 
> You can see 'EXTRACT_LAST' is ECF_CONST.
> 
> Whereas, RVV will fail since it is 'VEC_EXTRACT', which is not ECF_CONST:
> DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> 
> When I changed VEC_EXTRACT into ECF_CONST, we don't need 'vinfo->any_known_not_updated_vssa = true'
> The flow can perfectly work and no different from ARM SVE.
> 
> However, I found we can't make 'VEC_EXTRACT' as ECF_CONST since I found some targets use VEC_EXTRACT, extract element into a memory.
> 
> So.... I use 'vinfo->any_known_not_updated_vssa = true'
> 
> The alternative approach I think is adding IFN_EXTRACT_LAST_LEN as Richard said, and make IFN_EXTRACT_LAST_LEN as ECF_CONST, it can definitely work
> but such pattern is redundant since we can reuse 'VEC_EXTRACT' pattern which is suitable for us.

Hmm, I think VEC_EXTRACT and VEC_SET should be ECF_CONST.  Maybe the 
GIMPLE ISEL
comments do not match the implementation, but then that should be fixed?

/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls 
to
   internal function based on vector type of selected expansion.

   For vec_set:

     VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i_4(D);
   =>
     _7 = u;
     _8 = .VEC_SET (_7, i_4(D), _1);
     u = _8;
  
   For vec_extract:

      _3 = VIEW_CONVERT_EXPR<intD.1[4]>(vD.2208)[idx_2(D)];
   =>
      _4 = vD.2208;
      _3 = .VEC_EXTRACT (_4, idx_2(D));  */

Richard.

> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-10 19:09
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford
> Subject: Re: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > >> I guess as a temporary thing your approach is OK but we shouldn't
> > >> add these as part of new code - it's supposed to handle legacy
> > >> cases that we didn't fixup yet.
> > 
> > Do you mean we need to fix LC SSA PHI flow so that we don't need to 
> > set vinfo->any_known_not_updated_vssa = true ?
> > 
> > After it's fixed then this patch with removing 'vinfo->any_known_not_updated_vssa = true' is ok for trunk, am I right?
>  
> I want to know why we don't need this for SVE fully masked loops.  What
> inserts the required LC SSA PHI in that case?
>  
> > Thanks.
> > 
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-08-10 15:58
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford
> > Subject: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
> > On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > 
> > > Hi, Richard and Richi.
> > > 
> > > This patch adds support for live vectorization by VEC_EXTRACT for LEN loop control.
> > > 
> > > Consider this following case:
> > > 
> > > #include <stdint.h>
> > > 
> > > #define EXTRACT_LAST(TYPE) \
> > >   TYPE __attribute__ ((noinline, noclone)) \
> > >   test_##TYPE (TYPE *x, int n, TYPE value) \
> > >   { \
> > >     TYPE last; \
> > >     for (int j = 0; j < n; ++j) \
> > >       { \
> > > last = x[j]; \
> > > x[j] = last * value; \
> > >       } \
> > >     return last; \
> > >   }
> > > 
> > > #define TEST_ALL(T) \
> > >   T (uint8_t) \
> > > 
> > > TEST_ALL (EXTRACT_LAST)
> > > 
> > > ARM SVE IR:
> > > 
> > > Preheader:
> > >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > > 
> > > Loop:
> > >   ...
> > >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> > >   ...
> > >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > >   ...
> > >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > >   ...
> > > 
> > > Epilogue:
> > >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > > 
> > > For RVV since we prefer len in loop control, after this patch for RVV:
> > > 
> > > Loop:
> > >   ...
> > >   loop_len_22 = SELECT_VL;
> > >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> > >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> > >   ...
> > > 
> > > Epilogue:
> > >   _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);
> > > 
> > > Details of this approach:
> > > 
> > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
> > >             for LEN loop control.
> > >    
> > >    This function we check whether target support:
> > >     - Use LEN as the loop control.
> > >     - Support VEC_EXTRACT optab.
> > > 
> > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> > > 
> > > 3. Step 3 - Generate VEC_EXTRACT (v, LEN - 1 - BIAS).
> > > 
> > > NOTE: This patch sets 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
> > >       assignment, whereas VEC_EXTRACT is neither a pure nor a const function according to internal-fn.def:
> > > 
> > >       DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> > > 
> > >       If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:
> > > 
> > >         if (need_ssa_update_p (cfun))
> > >           {
> > >             gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
> > >             fun->gimple_df->ssa_renaming_needed = false;
> > >             todo |= TODO_update_ssa_only_virtuals;
> > >           }
> > >       
> > >       I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
> > >         
> > > - The one is in 'vectorizable_simd_clone_call':
> > > 
> > > /* When the original call is pure or const but the SIMD ABI dictates
> > > an aggregate return we will have to use a virtual definition and
> > > in a loop eventually even need to add a virtual PHI.  That's
> > > not straight-forward so allow to fix this up via renaming.  */
> > >       if (gimple_call_lhs (stmt)
> > >   && !gimple_vdef (stmt)
> > >   && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
> > > vinfo->any_known_not_updated_vssa = true;
> > >        
> > >        - The other is in 'vectorizable_load':
> > >        
> > >         if (memory_access_type == VMAT_LOAD_STORE_LANES)
> > >   vinfo->any_known_not_updated_vssa = true;
> > > 
> > >       It seems that they are the same reason as me doing in 'vectorizable_live_operation'.
> > >       Feel free to correct me if I am wrong.
> >  
> > You should always manually update things.  Did you verify the mask
> > case is handled by this?
> >  
> > There's the odd
> >  
> >       if (stmts)
> >         {
> >           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
> >           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> >  
> >           /* Remove existing phi from lhs and create one copy from 
> > new_tree.  */
> >           tree lhs_phi = NULL_TREE;
> >           gimple_stmt_iterator gsi;
> >           for (gsi = gsi_start_phis (exit_bb);
> >                !gsi_end_p (gsi); gsi_next (&gsi))
> >             {
> >               gimple *phi = gsi_stmt (gsi);
> >               if ((gimple_phi_arg_def (phi, 0) == lhs))
> >                 {
> >                   remove_phi_node (&gsi, false);
> >                   lhs_phi = gimple_phi_result (phi);
> >                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> >                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> >                   break;
> >                 }
> >             }
> >  
> > code but I don't think it will create new LC PHIs for the mask, instead
> > it will break LC SSA as well by removing a PHI?
> >  
> > I guess as a temporary thing your approach is OK but we shouldn't
> > add these as part of new code - it's supposed to handle legacy
> > cases that we didn't fixup yet.
> >  
> > Richard.
> >  
> >  
> >  
> > >       Bootstrap and Regression on X86 passed.
> > > 
> > > gcc/ChangeLog:
> > > 
> > >         * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> > >         (vectorizable_live_operation): Add loop LEN control.
> > > 
> > > ---
> > >  gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
> > >  1 file changed, 68 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index 00058c3c13e..208918f53fb 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -8964,6 +8964,24 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > >    && vect_can_vectorize_without_simd_p (tree_code (code)));
> > >  }
> > >  
> > > +/* Return true if target supports extract last vectorization with LEN.  */
> > > +
> > > +static bool
> > > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > > +{
> > > +  /* Return false if target doesn't support LEN in loop control.  */
> > > +  machine_mode vmode;
> > > +  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
> > > +      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
> > > +    return false;
> > > +
> > > +  /* Target need to support VEC_EXTRACT to extract the last active element.  */
> > > +  return convert_optab_handler (vec_extract_optab,
> > > + TYPE_MODE (vectype),
> > > + TYPE_MODE (TREE_TYPE (vectype)))
> > > + != CODE_FOR_nothing;
> > > +}
> > > +
> > >  /* Create vector init for vectorized iv.  */
> > >  static tree
> > >  vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > > @@ -10282,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo,
> > >        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > >  {
> > >    if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > > -        OPTIMIZE_FOR_SPEED))
> > > +        OPTIMIZE_FOR_SPEED)
> > > +       && !vect_can_vectorize_extract_last_with_len_p (vectype))
> > >      {
> > >        if (dump_enabled_p ())
> > >  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > @@ -10311,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo,
> > >    else
> > >      {
> > >        gcc_assert (ncopies == 1 && !slp_node);
> > > -       vect_record_loop_mask (loop_vinfo,
> > > -      &LOOP_VINFO_MASKS (loop_vinfo),
> > > -      1, vectype, NULL);
> > > +       if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > > + vect_record_loop_len (loop_vinfo,
> > > +       &LOOP_VINFO_LENS (loop_vinfo),
> > > +       1, vectype, 1);
> > > +       else
> > > + vect_record_loop_mask (loop_vinfo,
> > > +        &LOOP_VINFO_MASKS (loop_vinfo),
> > > +        1, vectype, NULL);
> > >      }
> > >  }
> > >        /* ???  Enable for loop costing as well.  */
> > > @@ -10339,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo,
> > >    gimple *vec_stmt;
> > >    if (slp_node)
> > >      {
> > > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > +      gcc_assert (!loop_vinfo
> > > +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > > +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> > >  
> > >        /* Get the correct slp vectorized stmt.  */
> > >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > > @@ -10383,7 +10409,43 @@ vectorizable_live_operation (vec_info *vinfo,
> > >  
> > >        gimple_seq stmts = NULL;
> > >        tree new_tree;
> > > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > > + {
> > > +   /* Emit:
> > > +
> > > +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
> > > +
> > > +      where VEC_LHS is the vectorized live-out result and MASK is
> > > +      the loop mask for the final iteration.  */
> > > +   gcc_assert (ncopies == 1 && !slp_node);
> > > +   tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > > +   tree len
> > > +     = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
> > > + 1, vectype, 0, 0);
> > > +
> > > +   /* BIAS + 1.  */
> > > +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > +   tree bias_one
> > > +     = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
> > > +   build_one_cst (TREE_TYPE (len)));
> > > +
> > > +   /* LAST_INDEX = LEN - (BIAS + 1).  */
> > > +   tree last_index
> > > +     = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
> > > +
> > > +   tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
> > > +   vec_lhs_phi, last_index);
> > > +
> > > +   /* Convert the extracted vector element to the scalar type.  */
> > > +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > > +   /* When the original stmt is an assignment but VEC_EXTRACT is not pure
> > > +      or const since it may return a memory result.  We will have to use
> > > +      a virtual definition and in a loop eventually even need to add a
> > > +      virtual PHI. That's not straight-forward so allow to fix this up
> > > +      via renaming.  */
> > > +   vinfo->any_known_not_updated_vssa = true;
> > > + }
> > > +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > >  {
> > >    /* Emit:
> > >  
> > > 
> >  
> > 
>  
>
  
juzhe.zhong@rivai.ai Aug. 10, 2023, 12:24 p.m. UTC | #7
Hi,Richi.

>> comments do not match the implementation, but then that should be fixed?

Do you mean that you would allow me to change VEC_EXTRACT into ECF_CONST?
If I can change VEC_EXTRACT into ECF_CONST, then this patch can definitely work;
there is no need for 'vinfo->any_known_not_updated_vssa = true'.

So, let me conclude:

I can remove 'vinfo->any_known_not_updated_vssa = true' and
set VEC_EXTRACT as ECF_CONST, then bootstrap and regtest on X86; if both pass, I will send a V3 patch.

Am I right?

Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-08-10 20:14
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford
Subject: Re: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
 
> Hi, Richi.
> 
> >> What inserts the required LC SSA PHI in that case?
> 
> Here is the flow how GCC insert LC SSA PHI flow for ARM SVE.
> You can see this following 'vect' dump details:
> https://godbolt.org/z/564o87oz3 
> 
> You can see this following information:
> 
> ;; Created LCSSA PHI: loop_mask_36 = PHI <loop_mask_22(3)>
>   # loop_mask_36 = PHI <loop_mask_22(3)>
>   _25 = .EXTRACT_LAST (loop_mask_36, vect_last_12.8_24);
>   last_17 = _25;
> 
> The '# loop_mask_36 = PHI <loop_mask_22(3)>' is inserted as follows:
> 
> Step 1 - Enter file tree-vectorizer.cc
> In the function pass_vectorize::execute (function *fun): 1358
> 'rewrite_into_loop_closed_ssa' is the key function that inserts the LC SSA PHI for ARM SVE in the mask loop case.
> 
> Step 2 - Investigate more into 'rewrite_into_loop_closed_ssa':
> In file tree-ssa-loop-manip.cc:628, 'rewrite_into_loop_closed_ssa' is directly calling 'rewrite_into_loop_closed_ssa_1'.
> Step 3 - Investigate 'rewrite_into_loop_closed_ssa_1':
> In file tree-ssa-loop-manip.cc:588 which is the function 'find_uses_to_rename' that:
> /* Marks names matching USE_FLAGS that are used outside of the loop they are
>    defined in for rewrite.  Records the set of blocks in which the ssa names are
>    used to USE_BLOCKS.  Record the SSA names that will need exit PHIs in
>    NEED_PHIS.  If CHANGED_BBS is not NULL, scan only blocks in this set.  */
> 
> static void
> find_uses_to_rename (bitmap changed_bbs, bitmap *use_blocks, bitmap need_phis,
>          int use_flags)
> {
>   basic_block bb;
>   unsigned index;
>   bitmap_iterator bi;
> 
>   if (changed_bbs)
>     EXECUTE_IF_SET_IN_BITMAP (changed_bbs, 0, index, bi)
>       {
>   bb = BASIC_BLOCK_FOR_FN (cfun, index);
>   if (bb)
>     find_uses_to_rename_bb (bb, use_blocks, need_phis, use_flags);
>       }
>   else
>     FOR_EACH_BB_FN (bb, cfun)
>       find_uses_to_rename_bb (bb, use_blocks, need_phis, use_flags);
> }
> 
> This function is iterating all blocks of the function to set the BITMAP which SSA need to be renamed then the later function will insert LC SSA for it.
> 
> In file tree-ssa-loop-manip.cc:606 which is the function 'add_exit_phis' that is the real function that is adding LC SSA by calling
> this eventually:
> /* Add a loop-closing PHI for VAR in basic block EXIT.  */
> 
> static void
> add_exit_phi (basic_block exit, tree var)
> {
>   gphi *phi;
>   edge e;
>   edge_iterator ei;
> 
>   /* Check that at least one of the edges entering the EXIT block exits
>      the loop, or a superloop of that loop, that VAR is defined in.  */
>   if (flag_checking)
>     {
>       gimple *def_stmt = SSA_NAME_DEF_STMT (var);
>       basic_block def_bb = gimple_bb (def_stmt);
>       FOR_EACH_EDGE (e, ei, exit->preds)
>   {
>     class loop *aloop = find_common_loop (def_bb->loop_father,
>              e->src->loop_father);
>     if (!flow_bb_inside_loop_p (aloop, e->dest))
>       break;
>   }
>       gcc_assert (e);
>     }
> 
>   phi = create_phi_node (NULL_TREE, exit);
>   create_new_def_for (var, phi, gimple_phi_result_ptr (phi));
>   FOR_EACH_EDGE (e, ei, exit->preds)
>     add_phi_arg (phi, var, e, UNKNOWN_LOCATION);
> 
>   if (dump_file && (dump_flags & TDF_DETAILS))
>     {
>       fprintf (dump_file, ";; Created LCSSA PHI: ");
>       print_gimple_stmt (dump_file, phi, 0, dump_flags);
>     }
> }
> 
> 
> This is how it works for ARM SVE in EXTRACT_LAST. Such flow (rewrite_into_loop_closed_ssa) can always insert LC SSA for RVV which is using length loop.
> 
> However,
> 
> >> I want to know why we don't need this for SVE fully masked loops. 
> 
> Before entering 'rewrite_into_loop_closed_ssa', there is a check here that RVV assertion failed but ARM SVE passed:
> 
>   /* We should not have to update virtual SSA form here but some
>      transforms involve creating new virtual definitions which makes
>      updating difficult.
>      We delay the actual update to the end of the pass but avoid
>      confusing ourselves by forcing need_ssa_update_p () to false.  */
>   unsigned todo = 0;
>   if (need_ssa_update_p (cfun))
>     {
>       gcc_assert (loop_vinfo->any_known_not_updated_vssa);
>       fun->gimple_df->ssa_renaming_needed = false;
>       todo |= TODO_update_ssa_only_virtuals;
>     }
> 
> in tree-vectorizer.cc, function 'vect_transform_loops'
> The assertion (gcc_assert (loop_vinfo->any_known_not_updated_vssa);)
> failed for RVV since it is false.
> 
> The reason why ARM SVE can pass is that the STMT1 before 'vectorizable_live_operation' and STMT2 after vectorization of 'vectorizable_live_operation'
> are both CONST or PURE since ARM SVE is using EXTRACT_LAST, here is the define of 'EXTRACT_LAST' internal function:
> /* Extract the last active element from a vector.  */
> DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
>            extract_last, fold_left)
> 
> You can see 'EXTRACT_LAST' is ECF_CONST.
> 
> Whereas, RVV will fail since it is 'VEC_EXTRACT', which is not ECF_CONST:
> DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> 
> When I changed VEC_EXTRACT into ECF_CONST, we don't need 'vinfo->any_known_not_updated_vssa = true'
> The flow can perfectly work and no different from ARM SVE.
> 
> However, I found we can't make 'VEC_EXTRACT' as ECF_CONST since I found some targets use VEC_EXTRACT, extract element into a memory.
> 
> So.... I use 'vinfo->any_known_not_updated_vssa = true'
> 
> The alternative approach I think is adding IFN_EXTRACT_LAST_LEN as Richard said, and make IFN_EXTRACT_LAST_LEN as ECF_CONST, it can definitely work
> but such pattern is redundant since we can reuse 'VEC_EXTRACT' pattern which is suitable for us.
 
Hmm, I think VEC_EXTRACT and VEC_SET should be ECF_CONST.  Maybe the 
GIMPLE ISEL
comments do not match the implementation, but then that should be fixed?
 
/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls 
to
   internal function based on vector type of selected expansion.
 
   For vec_set:
 
     VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i_4(D);
   =>
     _7 = u;
     _8 = .VEC_SET (_7, i_4(D), _1);
     u = _8;
  
   For vec_extract:
 
      _3 = VIEW_CONVERT_EXPR<intD.1[4]>(vD.2208)[idx_2(D)];
   =>
      _4 = vD.2208;
      _3 = .VEC_EXTRACT (_4, idx_2(D));  */
 
Richard.
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Biener
> Date: 2023-08-10 19:09
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford
> Subject: Re: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
>  
> > >> I guess as a temporary thing your approach is OK but we shouldn't
> > >> add these as part of new code - it's supposed to handle legacy
> > >> cases that we didn't fixup yet.
> > 
> > Do you mean we need to fix LC SSA PHI flow so that we don't need to 
> > set vinfo->any_known_not_updated_vssa = true ?
> > 
> > After it's fixed then this patch with removing 'vinfo->any_known_not_updated_vssa = true' is ok for trunk, am I right?
>  
> I want to know why we don't need this for SVE fully masked loops.  What
> inserts the required LC SSA PHI in that case?
>  
> > Thanks.
> > 
> > 
> > juzhe.zhong@rivai.ai
> >  
> > From: Richard Biener
> > Date: 2023-08-10 15:58
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford
> > Subject: Re: [PATCH V2] VECT: Support loop len control on EXTRACT_LAST vectorization
> > On Thu, 10 Aug 2023, juzhe.zhong@rivai.ai wrote:
> >  
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > 
> > > Hi, Richard and Richi.
> > > 
> > > This patch adds support for live vectorization by VEC_EXTRACT for LEN loop control.
> > > 
> > > Consider this following case:
> > > 
> > > #include <stdint.h>
> > > 
> > > #define EXTRACT_LAST(TYPE) \
> > >   TYPE __attribute__ ((noinline, noclone)) \
> > >   test_##TYPE (TYPE *x, int n, TYPE value) \
> > >   { \
> > >     TYPE last; \
> > >     for (int j = 0; j < n; ++j) \
> > >       { \
> > > last = x[j]; \
> > > x[j] = last * value; \
> > >       } \
> > >     return last; \
> > >   }
> > > 
> > > #define TEST_ALL(T) \
> > >   T (uint8_t) \
> > > 
> > > TEST_ALL (EXTRACT_LAST)
> > > 
> > > ARM SVE IR:
> > > 
> > > Preheader:
> > >   max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > > 
> > > Loop:
> > >   ...
> > >   # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> > >   ...
> > >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > >   .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > >   ...
> > >   next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > >   ...
> > > 
> > > Epilogue:
> > >   _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > > 
> > > For RVV since we prefer len in loop control, after this patch for RVV:
> > > 
> > > Loop:
> > >   ...
> > >   loop_len_22 = SELECT_VL;
> > >   vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> > >   vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > >   .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> > >   ...
> > > 
> > > Epilogue:
> > >   _25 = .VEC_EXTRACT (loop_len_22 - 1 - bias, vect_last_12.8_23);
> > > 
> > > Details of this approach:
> > > 
> > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p'  to enable live vectorization
> > >             for LEN loop control.
> > >    
> > >    This function we check whether target support:
> > >     - Use LEN as the loop control.
> > >     - Support VEC_EXTRACT optab.
> > > 
> > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> > > 
> > > 3. Step 3 - Generate VEC_EXTRACT (v, LEN - 1 - BIAS).
> > > 
> > > NOTE: This patch sets 'vinfo->any_known_not_updated_vssa = true;' since the original STMT is a simple
> > >       assignment, whereas VEC_EXTRACT is neither a pure nor a const function according to internal-fn.def:
> > > 
> > >       DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT, 0, vec_extract, vec_extract)
> > > 
> > >       If we don't set 'vinfo->any_known_not_updated_vssa' as true, it will cause ICE in:
> > > 
> > >         if (need_ssa_update_p (cfun))
> > >           {
> > >             gcc_assert (loop_vinfo->any_known_not_updated_vssa);  ----> Report assertion fail here.
> > >             fun->gimple_df->ssa_renaming_needed = false;
> > >             todo |= TODO_update_ssa_only_virtuals;
> > >           }
> > >       
> > >       I saw there are 2 places set 'vinfo->any_known_not_updated_vssa' as true:
> > >         
> > > - The one is in 'vectorizable_simd_clone_call':
> > > 
> > > /* When the original call is pure or const but the SIMD ABI dictates
> > > an aggregate return we will have to use a virtual definition and
> > > in a loop eventually even need to add a virtual PHI.  That's
> > > not straight-forward so allow to fix this up via renaming.  */
> > >       if (gimple_call_lhs (stmt)
> > >   && !gimple_vdef (stmt)
> > >   && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
> > > vinfo->any_known_not_updated_vssa = true;
> > >        
> > >        - The other is in 'vectorizable_load':
> > >        
> > >         if (memory_access_type == VMAT_LOAD_STORE_LANES)
> > >   vinfo->any_known_not_updated_vssa = true;
> > > 
> > >       It seems that they are the same reason as me doing in 'vectorizable_live_operation'.
> > >       Feel free to correct me if I am wrong.
> >  
> > You should always manually update things.  Did you verify the mask
> > case is handled by this?
> >  
> > There's the odd
> >  
> >       if (stmts)
> >         {
> >           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
> >           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> >  
> >           /* Remove existing phi from lhs and create one copy from 
> > new_tree.  */
> >           tree lhs_phi = NULL_TREE;
> >           gimple_stmt_iterator gsi;
> >           for (gsi = gsi_start_phis (exit_bb);
> >                !gsi_end_p (gsi); gsi_next (&gsi))
> >             {
> >               gimple *phi = gsi_stmt (gsi);
> >               if ((gimple_phi_arg_def (phi, 0) == lhs))
> >                 {
> >                   remove_phi_node (&gsi, false);
> >                   lhs_phi = gimple_phi_result (phi);
> >                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> >                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> >                   break;
> >                 }
> >             }
> >  
> > code but I don't think it will create new LC PHIs for the mask, instead
> > it will break LC SSA as well by removing a PHI?
> >  
> > I guess as a temporary thing your approach is OK but we shouldn't
> > add these as part of new code - it's supposed to handle legacy
> > cases that we didn't fixup yet.
> >  
> > Richard.
> >  
> >  
> >  
> > >       Bootstrap and Regression on X86 passed.
> > > 
> > > gcc/ChangeLog:
> > > 
> > >         * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> > >         (vectorizable_live_operation): Add loop LEN control.
> > > 
> > > ---
> > >  gcc/tree-vect-loop.cc | 74 +++++++++++++++++++++++++++++++++++++++----
> > >  1 file changed, 68 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index 00058c3c13e..208918f53fb 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -8964,6 +8964,24 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > >    && vect_can_vectorize_without_simd_p (tree_code (code)));
> > >  }
> > >  
> > > +/* Return true if target supports extract last vectorization with LEN.  */
> > > +
> > > +static bool
> > > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > > +{
> > > +  /* Return false if target doesn't support LEN in loop control.  */
> > > +  machine_mode vmode;
> > > +  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
> > > +      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
> > > +    return false;
> > > +
> > > +  /* The target needs to support VEC_EXTRACT to extract the last active element.  */
> > > +  return convert_optab_handler (vec_extract_optab,
> > > + TYPE_MODE (vectype),
> > > + TYPE_MODE (TREE_TYPE (vectype)))
> > > + != CODE_FOR_nothing;
> > > +}
> > > +
> > >  /* Create vector init for vectorized iv.  */
> > >  static tree
> > >  vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > > @@ -10282,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo,
> > >        if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > >  {
> > >    if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > > -        OPTIMIZE_FOR_SPEED))
> > > +        OPTIMIZE_FOR_SPEED)
> > > +       && !vect_can_vectorize_extract_last_with_len_p (vectype))
> > >      {
> > >        if (dump_enabled_p ())
> > >  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > @@ -10311,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo,
> > >    else
> > >      {
> > >        gcc_assert (ncopies == 1 && !slp_node);
> > > -       vect_record_loop_mask (loop_vinfo,
> > > -      &LOOP_VINFO_MASKS (loop_vinfo),
> > > -      1, vectype, NULL);
> > > +       if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > > + vect_record_loop_len (loop_vinfo,
> > > +       &LOOP_VINFO_LENS (loop_vinfo),
> > > +       1, vectype, 1);
> > > +       else
> > > + vect_record_loop_mask (loop_vinfo,
> > > +        &LOOP_VINFO_MASKS (loop_vinfo),
> > > +        1, vectype, NULL);
> > >      }
> > >  }
> > >        /* ???  Enable for loop costing as well.  */
> > > @@ -10339,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo,
> > >    gimple *vec_stmt;
> > >    if (slp_node)
> > >      {
> > > -      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > +      gcc_assert (!loop_vinfo
> > > +   || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > > +       && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> > >  
> > >        /* Get the correct slp vectorized stmt.  */
> > >        vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > > @@ -10383,7 +10409,43 @@ vectorizable_live_operation (vec_info *vinfo,
> > >  
> > >        gimple_seq stmts = NULL;
> > >        tree new_tree;
> > > -      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > +      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > > + {
> > > +   /* Emit:
> > > +
> > > +        SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
> > > +
> > > +      where VEC_LHS is the vectorized live-out result and LEN is
> > > +      the loop length of the final iteration.  */
> > > +   gcc_assert (ncopies == 1 && !slp_node);
> > > +   tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> > > +   tree len
> > > +     = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
> > > + 1, vectype, 0, 0);
> > > +
> > > +   /* BIAS + 1.  */
> > > +   signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > +   tree bias_one
> > > +     = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
> > > +   build_one_cst (TREE_TYPE (len)));
> > > +
> > > +   /* LAST_INDEX = LEN - (BIAS + 1).  */
> > > +   tree last_index
> > > +     = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
> > > +
> > > +   tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
> > > +   vec_lhs_phi, last_index);
> > > +
> > > +   /* Convert the extracted vector element to the scalar type.  */
> > > +   new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > > +   /* The original stmt is an assignment, but VEC_EXTRACT is not pure
> > > +      or const since it may return a memory result.  We will have to use
> > > +      a virtual definition and in a loop eventually even need to add a
> > > +      virtual PHI.  That's not straightforward, so allow this to be
> > > +      fixed up via renaming.  */
> > > +   vinfo->any_known_not_updated_vssa = true;
> > > + }
> > > +      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > >  {
> > >    /* Emit:
> > >  
> > > 
> >  
> > 
>  
> 
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
  
Robin Dapp Aug. 10, 2023, 12:33 p.m. UTC | #8
> Hmm, I think VEC_EXTRACT and VEC_SET should be ECF_CONST.  Maybe the 
> GIMPLE ISEL
> comments do not match the implementation, but then that should be fixed?
> 
> /* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls 
> to
>    internal function based on vector type of selected expansion.
> 
>    For vec_set:
> 
>      VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i_4(D);
>    =>
>      _7 = u;
>      _8 = .VEC_SET (_7, i_4(D), _1);
>      u = _8;
>   
>    For vec_extract:
> 
>       _3 = VIEW_CONVERT_EXPR<intD.1[4]>(vD.2208)[idx_2(D)];
>    =>
>       _4 = vD.2208;
>       _3 = .VEC_EXTRACT (_4, idx_2(D));  */
> 

I probably just forgot to set ECF_CONST in the recent isel patch
for vec_extract.

Regards
 Robin
  
Richard Biener Aug. 10, 2023, 12:35 p.m. UTC | #9
On Thu, 10 Aug 2023, Robin Dapp wrote:

> > Hmm, I think VEC_EXTRACT and VEC_SET should be ECF_CONST.  Maybe the 
> > GIMPLE ISEL
> > comments do not match the implementation, but then that should be fixed?
> > 
> > /* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls 
> > to
> >    internal function based on vector type of selected expansion.
> > 
> >    For vec_set:
> > 
> >      VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i_4(D);
> >    =>
> >      _7 = u;
> >      _8 = .VEC_SET (_7, i_4(D), _1);
> >      u = _8;
> >   
> >    For vec_extract:
> > 
> >       _3 = VIEW_CONVERT_EXPR<intD.1[4]>(vD.2208)[idx_2(D)];
> >    =>
> >       _4 = vD.2208;
> >       _3 = .VEC_EXTRACT (_4, idx_2(D));  */
> > 
> 
> I probably just forgot to set ECF_CONST in the recent isel patch
> for vec_extract.

I'm testing a patch adjusting a few IFNs where that was missed.

Richard.
  

Patch

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 00058c3c13e..208918f53fb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8964,6 +8964,24 @@  vect_can_vectorize_without_simd_p (code_helper code)
 	  && vect_can_vectorize_without_simd_p (tree_code (code)));
 }
 
+/* Return true if target supports extract last vectorization with LEN.  */
+
+static bool
+vect_can_vectorize_extract_last_with_len_p (tree vectype)
+{
+  /* Return false if target doesn't support LEN in loop control.  */
+  machine_mode vmode;
+  if (!get_len_load_store_mode (TYPE_MODE (vectype), true).exists (&vmode)
+      || !get_len_load_store_mode (TYPE_MODE (vectype), false).exists (&vmode))
+    return false;
+
+  /* The target needs to support VEC_EXTRACT to extract the last active element.  */
+  return convert_optab_handler (vec_extract_optab,
+				TYPE_MODE (vectype),
+				TYPE_MODE (TREE_TYPE (vectype)))
+	 != CODE_FOR_nothing;
+}
+
 /* Create vector init for vectorized iv.  */
 static tree
 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
@@ -10282,7 +10300,8 @@  vectorizable_live_operation (vec_info *vinfo,
       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
 	{
 	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-					       OPTIMIZE_FOR_SPEED))
+					       OPTIMIZE_FOR_SPEED)
+	      && !vect_can_vectorize_extract_last_with_len_p (vectype))
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10311,9 +10330,14 @@  vectorizable_live_operation (vec_info *vinfo,
 	  else
 	    {
 	      gcc_assert (ncopies == 1 && !slp_node);
-	      vect_record_loop_mask (loop_vinfo,
-				     &LOOP_VINFO_MASKS (loop_vinfo),
-				     1, vectype, NULL);
+	      if (vect_can_vectorize_extract_last_with_len_p (vectype))
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo),
+				      1, vectype, 1);
+	      else
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       1, vectype, NULL);
 	    }
 	}
       /* ???  Enable for loop costing as well.  */
@@ -10339,7 +10363,9 @@  vectorizable_live_operation (vec_info *vinfo,
   gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo
+		  || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+		      && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
 
       /* Get the correct slp vectorized stmt.  */
       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10383,7 +10409,43 @@  vectorizable_live_operation (vec_info *vinfo,
 
       gimple_seq stmts = NULL;
       tree new_tree;
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+	{
+	  /* Emit:
+
+	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - BIAS - 1>
+
+	     where VEC_LHS is the vectorized live-out result and LEN is
+	     the loop length of the final iteration.  */
+	  gcc_assert (ncopies == 1 && !slp_node);
+	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+	  tree len
+	    = vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS (loop_vinfo),
+				 1, vectype, 0, 0);
+
+	  /* BIAS + 1.  */
+	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	  tree bias_one
+	    = size_binop (PLUS_EXPR, build_int_cst (TREE_TYPE (len), biasval),
+			  build_one_cst (TREE_TYPE (len)));
+
+	  /* LAST_INDEX = LEN - (BIAS + 1).  */
+	  tree last_index
+	    = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len), len, bias_one);
+
+	  tree scalar_res = gimple_build (&stmts, CFN_VEC_EXTRACT, scalar_type,
+					  vec_lhs_phi, last_index);
+
+	  /* Convert the extracted vector element to the scalar type.  */
+	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+	  /* The original stmt is an assignment, but VEC_EXTRACT is not pure
+	     or const since it may return a memory result.  We will have to use
+	     a virtual definition and in a loop eventually even need to add a
+	     virtual PHI.  That's not straightforward, so allow this to be
+	     fixed up via renaming.  */
+	  vinfo->any_known_not_updated_vssa = true;
+	}
+      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	{
 	  /* Emit: