[V2] VECT: Apply COND_LEN_* into vectorizable_operation

Message ID: 20230712103621.47696-1-juzhe.zhong@rivai.ai
State: Unresolved
Series: [V2] VECT: Apply COND_LEN_* into vectorizable_operation

Checks:
snail/gcc-patch-check: warning (git am fail)

Commit Message

juzhe.zhong@rivai.ai July 12, 2023, 10:36 a.m. UTC
  From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.
As we discussed before, COND_LEN_* patterns were added for multiple situations.
This patch applies COND_LEN_* to the following situation:

Support the case in "vectorizable_operation" where:
  /* If operating on inactive elements could generate spurious traps,
     we need to restrict the operation to active lanes.  Note that this
     specifically doesn't apply to unhoisted invariants, since they
     operate on the same value for every lane.

     Similarly, if this operation is part of a reduction, a fully-masked
     loop should only change the active lanes of the reduction chain,
     keeping the inactive lanes as-is.  */
  bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
			    || reduc_idx >= 0);

The patch handles the case where mask_out_inactive is true under length-based loop control.

This enables the following two cases:

1. Integer division:

   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] % b[i];				\
   }
   #define TEST_ALL() \
   TEST_TYPE(int8_t)
   TEST_ALL()

With this patch:
  
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

2. Floating-point arithmetic **WITHOUT** -ffast-math
  
   #define TEST_TYPE(TYPE) 				\
   __attribute__((noipa))				\
   void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
   {							\
     for (int i = 0; i < n; i++)				\
       dst[i] = a[i] + b[i];				\
   }
   #define TEST_ALL() \
   TEST_TYPE(float)
   TEST_ALL()

With this patch:
   
  _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
  ivtmp_45 = _61 * 4;
  vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
  vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
  .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);

With this patch, we can make sure the operations won't trap on the inactive elements that "mask_out_inactive" refers to.
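
For reference, here is a minimal scalar sketch of the .COND_LEN_ADD
semantics relied on above, assuming (as clarified later in this thread)
that masked-off lanes and lanes beyond the active length take the
pass-through ELSE operand.  This is only an illustration, not the
authoritative internal-fn definition:

  /* Scalar model of LHS = .COND_LEN_ADD (COND, A, B, ELSE, LEN, BIAS)
     for a vector with NUNITS elements.  Lanes that are masked off or
     beyond the active length keep the pass-through ELSE value.  */
  void
  cond_len_add_model (int *lhs, const _Bool *cond, const int *a,
                      const int *b, const int *else_vals,
                      int len, int bias, int nunits)
  {
    for (int i = 0; i < nunits; i++)
      {
        if (cond[i] && i < len + bias)
          lhs[i] = a[i] + b[i];
        else
          lhs[i] = else_vals[i];
      }
  }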

gcc/ChangeLog:

        * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
        (CASE): Ditto.
        (get_conditional_len_internal_fn): New function.
        * internal-fn.h (get_conditional_len_internal_fn): Ditto.
        * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_* support.

---
 gcc/internal-fn.cc     | 65 ++++++++++++++++++++++++++++++------------
 gcc/internal-fn.h      |  1 +
 gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++++++-------
 3 files changed, 85 insertions(+), 29 deletions(-)
  

Comments

Richard Sandiford July 12, 2023, 10:53 a.m. UTC | #1
juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
> As we discussed before, COND_LEN_* patterns were added for multiple situations.
> This patch applies COND_LEN_* to the following situation:
>
> Support the case in "vectorizable_operation" where:
>   /* If operating on inactive elements could generate spurious traps,
>      we need to restrict the operation to active lanes.  Note that this
>      specifically doesn't apply to unhoisted invariants, since they
>      operate on the same value for every lane.
>
>      Similarly, if this operation is part of a reduction, a fully-masked
>      loop should only change the active lanes of the reduction chain,
>      keeping the inactive lanes as-is.  */
>   bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
> 			    || reduc_idx >= 0);
>
> The patch handles the case where mask_out_inactive is true under length-based loop control.
>
> This enables the following two cases:
>
> 1. Integer division:
>
>    #define TEST_TYPE(TYPE) 				\
>    __attribute__((noipa))				\
>    void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
>    {							\
>      for (int i = 0; i < n; i++)				\
>        dst[i] = a[i] % b[i];				\
>    }
>    #define TEST_ALL() \
>    TEST_TYPE(int8_t)
>    TEST_ALL()
>
> With this patch:
>   
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> 2. Floating-point arithmetic **WITHOUT** -ffast-math
>   
>    #define TEST_TYPE(TYPE) 				\
>    __attribute__((noipa))				\
>    void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
>    {							\
>      for (int i = 0; i < n; i++)				\
>        dst[i] = a[i] + b[i];				\
>    }
>    #define TEST_ALL() \
>    TEST_TYPE(float)
>    TEST_ALL()
>
> With this patch:
>    
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> With this patch, we can make sure the operations won't trap on the inactive elements that "mask_out_inactive" refers to.
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
>         (CASE): Ditto.
>         (get_conditional_len_internal_fn): New function.
>         * internal-fn.h (get_conditional_len_internal_fn): Ditto.
>         * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_* support.
>
> ---
>  gcc/internal-fn.cc     | 65 ++++++++++++++++++++++++++++++------------
>  gcc/internal-fn.h      |  1 +
>  gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++++++-------
>  3 files changed, 85 insertions(+), 29 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index f9aaf66cf2a..7e3a8cc8412 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4276,23 +4276,24 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
>    0
>  };
>  
> -/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
> -   tree code CODE.  */
> +/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
> +   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
> +   for each such IFN_COND_##SUFFIX.  */
>  #define FOR_EACH_CODE_MAPPING(T) \
> -  T (PLUS_EXPR, IFN_COND_ADD) \
> -  T (MINUS_EXPR, IFN_COND_SUB) \
> -  T (MULT_EXPR, IFN_COND_MUL) \
> -  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
> -  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
> -  T (RDIV_EXPR, IFN_COND_RDIV) \
> -  T (MIN_EXPR, IFN_COND_MIN) \
> -  T (MAX_EXPR, IFN_COND_MAX) \
> -  T (BIT_AND_EXPR, IFN_COND_AND) \
> -  T (BIT_IOR_EXPR, IFN_COND_IOR) \
> -  T (BIT_XOR_EXPR, IFN_COND_XOR) \
> -  T (LSHIFT_EXPR, IFN_COND_SHL) \
> -  T (RSHIFT_EXPR, IFN_COND_SHR) \
> -  T (NEGATE_EXPR, IFN_COND_NEG)
> +  T (PLUS_EXPR, ADD) \
> +  T (MINUS_EXPR, SUB) \
> +  T (MULT_EXPR, MUL) \
> +  T (TRUNC_DIV_EXPR, DIV) \
> +  T (TRUNC_MOD_EXPR, MOD) \
> +  T (RDIV_EXPR, RDIV) \
> +  T (MIN_EXPR, MIN) \
> +  T (MAX_EXPR, MAX) \
> +  T (BIT_AND_EXPR, AND) \
> +  T (BIT_IOR_EXPR, IOR) \
> +  T (BIT_XOR_EXPR, XOR) \
> +  T (LSHIFT_EXPR, SHL) \
> +  T (RSHIFT_EXPR, SHR) \
> +  T (NEGATE_EXPR, NEG)
>  
>  /* Return a function that only performs CODE when a certain condition is met
>     and that uses a given fallback value otherwise.  For example, if CODE is
> @@ -4313,7 +4314,7 @@ get_conditional_internal_fn (tree_code code)
>  {
>    switch (code)
>      {
> -#define CASE(CODE, IFN) case CODE: return IFN;
> +#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4329,7 +4330,7 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>    switch (ifn)
>      {
> -#define CASE(CODE, IFN) case IFN: return CODE;
> +#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4337,6 +4338,34 @@ conditional_internal_fn_code (internal_fn ifn)
>      }
>  }
>  
> +/* Return a function that only performs CODE when a certain condition is met
> +   and that uses a given fallback value otherwise.  For example, if CODE is

This describes COND_* rather than COND_LEN_*.  How about:

/* Like get_conditional_internal_fn, but return a function that
   additionally restricts the operation to the leading elements
   of a vector.  The number of elements to process is given by
   a length and bias pair, as for IFN_LOAD_LEN.  The values of
   the remaining elements are undefined.

   For example, if CODE is [...]

(Please confirm that the last sentence of the first paragraph is true.)

> +   a binary operation associated with conditional function FN:
> +
> +     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
> +
> +   is equivalent to the C expression:
> +
> +     for (int i = 0; i < LEN + BIAS; i++)
> +       LHS[i] = COND[i] ? A[i] CODE B[i] : ELSE[i];
> +
> +   operating elementwise if the operands are vectors.
> +

The "operating elementwise" doesn't make sense in this context,
since the example is a loop that always operates elementwise.
Seems better to drop that line.

OK with those changes, thanks.

Richard

> +   Return IFN_LAST if no such function exists.  */
> +
> +internal_fn
> +get_conditional_len_internal_fn (tree_code code)
> +{
> +  switch (code)
> +    {
> +#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
> +      FOR_EACH_CODE_MAPPING(CASE)
> +#undef CASE
> +    default:
> +      return IFN_LAST;
> +    }
> +}
> +
>  /* Invoke T(IFN) for each internal function IFN that also has an
>     IFN_COND_* form.  */
>  #define FOR_EACH_COND_FN_PAIR(T) \
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 4234bbfed87..dd1bab0bddf 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
>  
>  extern internal_fn get_conditional_internal_fn (tree_code);
>  extern internal_fn get_conditional_internal_fn (internal_fn);
> +extern internal_fn get_conditional_len_internal_fn (tree_code);
>  extern tree_code conditional_internal_fn_code (internal_fn);
>  extern internal_fn get_unconditional_internal_fn (internal_fn);
>  extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 10e71178ce7..dd24f017235 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
>  
>    int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
>    vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
> +  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
>    internal_fn cond_fn = get_conditional_internal_fn (code);
> +  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
>  
>    /* If operating on inactive elements could generate spurious traps,
>       we need to restrict the operation to active lanes.  Note that this
> @@ -6730,9 +6732,17 @@ vectorizable_operation (vec_info *vinfo,
>  	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
>  	  && mask_out_inactive)
>  	{
> -	  if (cond_fn == IFN_LAST
> -	      || !direct_internal_fn_supported_p (cond_fn, vectype,
> -						  OPTIMIZE_FOR_SPEED))
> +	  if (cond_fn != IFN_LAST
> +	      && direct_internal_fn_supported_p (cond_fn, vectype,
> +						 OPTIMIZE_FOR_SPEED))
> +	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> +				   vectype, NULL);
> +	  else if (cond_len_fn != IFN_LAST
> +		   && direct_internal_fn_supported_p (cond_len_fn, vectype,
> +						      OPTIMIZE_FOR_SPEED))
> +	    vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
> +				  1);
> +	  else
>  	    {
>  	      if (dump_enabled_p ())
>  		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -6740,9 +6750,6 @@ vectorizable_operation (vec_info *vinfo,
>  				 " conditional operation is available.\n");
>  	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  	    }
> -	  else
> -	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> -				   vectype, NULL);
>  	}
>  
>        /* Put types on constant and invariant SLP children.  */
> @@ -6805,6 +6812,7 @@ vectorizable_operation (vec_info *vinfo,
>                       "transform binary/unary operation.\n");
>  
>    bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
>  
>    /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
>       vectors with unsigned elements, but the result is signed.  So, we
> @@ -6971,11 +6979,16 @@ vectorizable_operation (vec_info *vinfo,
>  	  gimple_assign_set_lhs (new_stmt, new_temp);
>  	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>  	}
> -      else if (masked_loop_p && mask_out_inactive)
> +      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
>  	{
> -	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> -					  vec_num * ncopies, vectype, i);
> -	  auto_vec<tree> vops (5);
> +	  tree mask;
> +	  if (masked_loop_p)
> +	    mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> +				       vec_num * ncopies, vectype, i);
> +	  else
> +	    /* Dummy mask.  */
> +	    mask = build_minus_one_cst (truth_type_for (vectype));
> +	  auto_vec<tree> vops (6);
>  	  vops.quick_push (mask);
>  	  vops.quick_push (vop0);
>  	  if (vop1)
> @@ -6995,7 +7008,20 @@ vectorizable_operation (vec_info *vinfo,
>  		(cond_fn, vectype, vops.length () - 1, &vops[1]);
>  	      vops.quick_push (else_value);
>  	    }
> -	  gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
> +	  if (len_loop_p)
> +	    {
> +	      tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
> +					    vec_num * ncopies, vectype, i, 1);
> +	      signed char biasval
> +		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +	      tree bias = build_int_cst (intQI_type_node, biasval);
> +	      vops.quick_push (len);
> +	      vops.quick_push (bias);
> +	    }
> +	  gcall *call
> +	    = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
> +							    : cond_len_fn,
> +					      vops);
>  	  new_temp = make_ssa_name (vec_dest, call);
>  	  gimple_call_set_lhs (call, new_temp);
>  	  gimple_call_set_nothrow (call, true);
  
juzhe.zhong@rivai.ai July 12, 2023, 10:54 a.m. UTC | #2
Thank you so much.
I am going to wait for Richi's final approval.

Thanks.


juzhe.zhong@rivai.ai
 
Richard Biener July 12, 2023, 10:59 a.m. UTC | #3
On Wed, 12 Jul 2023, juzhe.zhong@rivai.ai wrote:

> Thank you so much.
> I am gonna wait for Richi's final approval.

It's good enough when either of us approves, unless we explicitly ask
to wait for somebody else.

LGTM anyway.

Richard.

juzhe.zhong@rivai.ai July 12, 2023, 10:59 a.m. UTC | #4
Hi, Richard. 
I just noticed:
+ The values of the remaining elements are undefined.

This is not true for RVV. Actually we want the remaining elements to keep the old value; that's why we have the additional pass-through operand.

For a reduction:

for (int i = ...)
   res += a[i]

In this case, the length generated by .SELECT_VL in the last iteration will
probably cover only a partial vector, so the elements >= length should keep
their original value.
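
A rough scalar model of the update the reduction relies on (a hypothetical
sketch; the accumulator is passed as the ELSE/pass-through operand so that
lanes >= length are left unchanged):

  /* Model of acc = .COND_LEN_ADD ({ -1, ... }, acc, a, acc, len, 0)
     in a length-controlled reduction: the first LEN lanes accumulate
     and the remaining lanes keep the old accumulator value.  */
  void
  reduc_step_model (int *acc, const int *a, int len, int nunits)
  {
    for (int i = 0; i < nunits; i++)
      if (i < len)
        acc[i] += a[i];
      /* Lanes i >= len keep their previous accumulator value.  */
  }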

Thanks.


juzhe.zhong@rivai.ai
 
juzhe.zhong@rivai.ai July 12, 2023, 11:04 a.m. UTC | #5
Oh, sorry; I just realized the pseudo-code was incorrect.

> +     for (int i = 0; i < LEN + BIAS; i++)
> +       LHS[i] = COND[i] ? A[i] CODE B[i] : ELSE[i];


I think it should be:

 for (int i = 0; i < NUNITS; i++) {
   if (cond[i] && i < LEN + BIAS)
     LHS[i] = A[i] CODE B[i];
   else
     LHS[i] = ELSE[i];
 }

Does it look reasonable to you?

Thanks.



juzhe.zhong@rivai.ai
 
From: Richard Sandiford
Date: 2023-07-12 18:53
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V2] VECT: Apply COND_LEN_* into vectorizable_operation
juzhe.zhong@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
> As we disscussed before, COND_LEN_* patterns were added for multiple situations.
> This patch apply CON_LEN_* for the following situation:
>
> Support for the situation that in "vectorizable_operation":
>   /* If operating on inactive elements could generate spurious traps,
>      we need to restrict the operation to active lanes.  Note that this
>      specifically doesn't apply to unhoisted invariants, since they
>      operate on the same value for every lane.
>
>      Similarly, if this operation is part of a reduction, a fully-masked
>      loop should only change the active lanes of the reduction chain,
>      keeping the inactive lanes as-is.  */
>   bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
>     || reduc_idx >= 0);
>
> For mask_out_inactive is true with length loop control.
>
> So, we can these 2 following cases:
>
> 1. Integer division:
>
>    #define TEST_TYPE(TYPE) \
>    __attribute__((noipa)) \
>    void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n) \
>    { \
>      for (int i = 0; i < n; i++) \
>        dst[i] = a[i] % b[i]; \
>    }
>    #define TEST_ALL() \
>    TEST_TYPE(int8_t) \
>    TEST_ALL()
>
> With this patch:
>   
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> 2. Floating-point arithmetic **WITHOUT** -ffast-math
>   
>    #define TEST_TYPE(TYPE) \
>    __attribute__((noipa)) \
>    void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n) \
>    { \
>      for (int i = 0; i < n; i++) \
>        dst[i] = a[i] + b[i]; \
>    }
>    #define TEST_ALL() \
>    TEST_TYPE(float) \
>    TEST_ALL()
>
> With this patch:
>    
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52, vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> With this patch, we can make sure operations won't trap for elements that "mask_out_inactive".
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_* support.
>         (CASE): Ditto.
>         (get_conditional_len_internal_fn): New function.
>         * internal-fn.h (get_conditional_len_internal_fn): Ditto.
>         * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_* support.
>
> ---
>  gcc/internal-fn.cc     | 65 ++++++++++++++++++++++++++++++------------
>  gcc/internal-fn.h      |  1 +
>  gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++++++-------
>  3 files changed, 85 insertions(+), 29 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index f9aaf66cf2a..7e3a8cc8412 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4276,23 +4276,24 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
>    0
>  };
>  
> -/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
> -   tree code CODE.  */
> +/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
> +   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
> +   for each such IFN_COND_##SUFFIX.  */
>  #define FOR_EACH_CODE_MAPPING(T) \
> -  T (PLUS_EXPR, IFN_COND_ADD) \
> -  T (MINUS_EXPR, IFN_COND_SUB) \
> -  T (MULT_EXPR, IFN_COND_MUL) \
> -  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
> -  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
> -  T (RDIV_EXPR, IFN_COND_RDIV) \
> -  T (MIN_EXPR, IFN_COND_MIN) \
> -  T (MAX_EXPR, IFN_COND_MAX) \
> -  T (BIT_AND_EXPR, IFN_COND_AND) \
> -  T (BIT_IOR_EXPR, IFN_COND_IOR) \
> -  T (BIT_XOR_EXPR, IFN_COND_XOR) \
> -  T (LSHIFT_EXPR, IFN_COND_SHL) \
> -  T (RSHIFT_EXPR, IFN_COND_SHR) \
> -  T (NEGATE_EXPR, IFN_COND_NEG)
> +  T (PLUS_EXPR, ADD) \
> +  T (MINUS_EXPR, SUB) \
> +  T (MULT_EXPR, MUL) \
> +  T (TRUNC_DIV_EXPR, DIV) \
> +  T (TRUNC_MOD_EXPR, MOD) \
> +  T (RDIV_EXPR, RDIV) \
> +  T (MIN_EXPR, MIN) \
> +  T (MAX_EXPR, MAX) \
> +  T (BIT_AND_EXPR, AND) \
> +  T (BIT_IOR_EXPR, IOR) \
> +  T (BIT_XOR_EXPR, XOR) \
> +  T (LSHIFT_EXPR, SHL) \
> +  T (RSHIFT_EXPR, SHR) \
> +  T (NEGATE_EXPR, NEG)
>  
>  /* Return a function that only performs CODE when a certain condition is met
>     and that uses a given fallback value otherwise.  For example, if CODE is
> @@ -4313,7 +4314,7 @@ get_conditional_internal_fn (tree_code code)
>  {
>    switch (code)
>      {
> -#define CASE(CODE, IFN) case CODE: return IFN;
> +#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4329,7 +4330,7 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>    switch (ifn)
>      {
> -#define CASE(CODE, IFN) case IFN: return CODE;
> +#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4337,6 +4338,34 @@ conditional_internal_fn_code (internal_fn ifn)
>      }
>  }
>  
> +/* Return a function that only performs CODE when a certain condition is met
> +   and that uses a given fallback value otherwise.  For example, if CODE is
 
This describes COND_* rather than COND_LEN_*.  How about:
 
/* Like get_conditional_internal_fn, but return a function that
   additionally restricts the operation to the leading elements
   of a vector.  The number of elements to process is given by
   a length and bias pair, as for IFN_LEN_LOAD.  The values of
   the remaining elements are undefined.
 
   For example, if CODE is [...]
 
(Please confirm that the last sentence of the first paragraph is true.)
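
For context, the bias is the target's partial load/store bias
(LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS), to my knowledge a per-target constant
of 0 or -1, so the count of leading elements actually processed works as in
this sketch (illustrative, not GCC source):

  /* Sketch: number of leading elements a COND_LEN_* call processes.
     BIAS is a per-target constant (0, or -1 on some targets), so e.g.
     LEN = 4 with BIAS = -1 processes elements 0..2.  */
  static long
  active_elements (long len, long bias)
  {
    return len + bias;
  }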
 
> +   a binary operation associated with conditional function FN:
> +
> +     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
> +
> +   is equivalent to the C expression:
> +
> +     for (int i = 0; i < LEN + BIAS; i++)
> +       LHS[i] = COND[i] ? A[i] CODE B[i] : ELSE[i];
> +
> +   operating elementwise if the operands are vectors.
> +
 
The "operating elementwise" doesn't make sense in this context,
since the example is a loop that always operates elementwise.
Seems better to drop that line.
 
OK with those changes, thanks.
 
Richard
 
> +   Return IFN_LAST if no such function exists.  */
> +
> +internal_fn
> +get_conditional_len_internal_fn (tree_code code)
> +{
> +  switch (code)
> +    {
> +#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
> +      FOR_EACH_CODE_MAPPING(CASE)
> +#undef CASE
> +    default:
> +      return IFN_LAST;
> +    }
> +}
> +
>  /* Invoke T(IFN) for each internal function IFN that also has an
>     IFN_COND_* form.  */
>  #define FOR_EACH_COND_FN_PAIR(T) \
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 4234bbfed87..dd1bab0bddf 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
>  
>  extern internal_fn get_conditional_internal_fn (tree_code);
>  extern internal_fn get_conditional_internal_fn (internal_fn);
> +extern internal_fn get_conditional_len_internal_fn (tree_code);
>  extern tree_code conditional_internal_fn_code (internal_fn);
>  extern internal_fn get_unconditional_internal_fn (internal_fn);
>  extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 10e71178ce7..dd24f017235 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
>  
>    int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
>    vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
> +  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
>    internal_fn cond_fn = get_conditional_internal_fn (code);
> +  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
>  
>    /* If operating on inactive elements could generate spurious traps,
>       we need to restrict the operation to active lanes.  Note that this
> @@ -6730,9 +6732,17 @@ vectorizable_operation (vec_info *vinfo,
>    && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
>    && mask_out_inactive)
>  {
> -   if (cond_fn == IFN_LAST
> -       || !direct_internal_fn_supported_p (cond_fn, vectype,
> -   OPTIMIZE_FOR_SPEED))
> +   if (cond_fn != IFN_LAST
> +       && direct_internal_fn_supported_p (cond_fn, vectype,
> + OPTIMIZE_FOR_SPEED))
> +     vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> +    vectype, NULL);
> +   else if (cond_len_fn != IFN_LAST
> +    && direct_internal_fn_supported_p (cond_len_fn, vectype,
> +       OPTIMIZE_FOR_SPEED))
> +     vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
> +   1);
> +   else
>      {
>        if (dump_enabled_p ())
>  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -6740,9 +6750,6 @@ vectorizable_operation (vec_info *vinfo,
>  " conditional operation is available.\n");
>        LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>      }
> -   else
> -     vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> -    vectype, NULL);
>  }
>  
>        /* Put types on constant and invariant SLP children.  */
> @@ -6805,6 +6812,7 @@ vectorizable_operation (vec_info *vinfo,
>                       "transform binary/unary operation.\n");
>  
>    bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
>  
>    /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
>       vectors with unsigned elements, but the result is signed.  So, we
> @@ -6971,11 +6979,16 @@ vectorizable_operation (vec_info *vinfo,
>    gimple_assign_set_lhs (new_stmt, new_temp);
>    vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>  }
> -      else if (masked_loop_p && mask_out_inactive)
> +      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
>  {
> -   tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> -   vec_num * ncopies, vectype, i);
> -   auto_vec<tree> vops (5);
> +   tree mask;
> +   if (masked_loop_p)
> +     mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> +        vec_num * ncopies, vectype, i);
> +   else
> +     /* Dummy mask.  */
> +     mask = build_minus_one_cst (truth_type_for (vectype));
> +   auto_vec<tree> vops (6);
>    vops.quick_push (mask);
>    vops.quick_push (vop0);
>    if (vop1)
> @@ -6995,7 +7008,20 @@ vectorizable_operation (vec_info *vinfo,
>  (cond_fn, vectype, vops.length () - 1, &vops[1]);
>        vops.quick_push (else_value);
>      }
> -   gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
> +   if (len_loop_p)
> +     {
> +       tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
> +     vec_num * ncopies, vectype, i, 1);
> +       signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +       tree bias = build_int_cst (intQI_type_node, biasval);
> +       vops.quick_push (len);
> +       vops.quick_push (bias);
> +     }
> +   gcall *call
> +     = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
> +     : cond_len_fn,
> +       vops);
>    new_temp = make_ssa_name (vec_dest, call);
>    gimple_call_set_lhs (call, new_temp);
>    gimple_call_set_nothrow (call, true);
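
To summarize the tree-vect-stmts.cc change under review: a length-controlled
loop has no loop mask, so the patch pushes an all-ones dummy mask, appends the
length/bias pair, and calls cond_len_fn instead of cond_fn.  A condensed
standalone model of the resulting operand order, with strings standing in for
gimple trees (a sketch, not GCC code):

  #include <string>
  #include <vector>

  // Toy model of the operand list built for the conditional operation:
  // { mask, op0 [, op1 [, op2]], else, [len, bias] }.
  std::vector<std::string>
  model_cond_op_operands (bool masked_loop_p, bool len_loop_p,
                          std::vector<std::string> ops, std::string else_value)
  {
    std::vector<std::string> vops;
    // No real mask under length control, so use an all-ones dummy.
    vops.push_back (masked_loop_p ? "loop_mask_i" : "{ -1, ... }");
    for (const std::string &op : ops)
      vops.push_back (op);              // op0, op1, ...
    vops.push_back (else_value);
    if (len_loop_p)
      {
        vops.push_back ("loop_len_i");  // from vect_get_loop_len
        vops.push_back ("bias");        // LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
      }
    return vops;  // passed to cond_fn or cond_len_fn respectively
  }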
  
juzhe.zhong@rivai.ai July 12, 2023, 11:17 a.m. UTC | #6
Thanks Richard and Richi so much!

I have sent V3, to be merged after bootstrap && regression finish.

It adds a clearer comment, as follows:

+/* Like get_conditional_internal_fn, but return a function that
+   additionally restricts the operation to the leading elements
+   of a vector.  The number of elements to process is given by
+   a length and bias pair.  The function performs CODE only when
+   a certain condition is met and the element is located within
+   LEN + BIAS (i < LEN + BIAS), and uses a given fallback value
+   otherwise.
+
+   For example, if CODE is [PLUS, MINUS, ... etc]:
+
+     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
+
+   is equivalent to the C expression:
+
+     for (int i = 0; i < NUNITS; i++)
+      {
+	if (COND[i] && i < (LEN + BIAS))
+	  LHS[i] = A[i] CODE B[i];
+	else
+	  LHS[i] = ELSE[i];
+      }
+*/
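
As a concrete instance of that comment (all values made up): with NUNITS = 4,
LEN = 3 and BIAS = 0, a lane that fails COND and a lane beyond LEN + BIAS both
take the ELSE value.  A self-contained check:

  #include <cstdio>

  /* Concrete instance of the comment above; all values are made up.  */
  int
  main ()
  {
    const int nunits = 4, len = 3, bias = 0;
    int cond[] = { 1, 1, 0, 1 };
    int a[] = { 1, 2, 3, 4 }, b[] = { 10, 20, 30, 40 };
    int else_v[] = { 0, 0, 0, 0 }, lhs[4];

    for (int i = 0; i < nunits; i++)
      lhs[i] = (cond[i] && i < len + bias) ? a[i] + b[i] : else_v[i];

    for (int i = 0; i < nunits; i++)
      printf ("%d ", lhs[i]);  /* prints: 11 22 0 0 */
    return 0;
  }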
Thanks.


juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-07-12 18:59
To: juzhe.zhong@rivai.ai
CC: richard.sandiford; gcc-patches
Subject: Re: Re: [PATCH V2] VECT: Apply COND_LEN_* into vectorizable_operation
On Wed, 12 Jul 2023, juzhe.zhong@rivai.ai wrote:
 
> Thank you so much.
> I am gonna wait for Richi's final approval.
 
It's good enough when either of us approves unless we explicitly ask
to wait for somebody else.
 
LGTM anyway.
 
Richard.
 
> Thanks.
> 
> 
> juzhe.zhong@rivai.ai
>  
> From: Richard Sandiford
> Date: 2023-07-12 18:53
> To: juzhe.zhong
> CC: gcc-patches; rguenther
> Subject: Re: [PATCH V2] VECT: Apply COND_LEN_* into vectorizable_operation
> [... full quote of Richard Sandiford's review and the patch, shown earlier in the thread, snipped ...]
 
-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
  

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index f9aaf66cf2a..7e3a8cc8412 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4276,23 +4276,24 @@  static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
   0
 };
 
-/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
-   tree code CODE.  */
+/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
+   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
+   for each such IFN_COND_##SUFFIX.  */
 #define FOR_EACH_CODE_MAPPING(T) \
-  T (PLUS_EXPR, IFN_COND_ADD) \
-  T (MINUS_EXPR, IFN_COND_SUB) \
-  T (MULT_EXPR, IFN_COND_MUL) \
-  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
-  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
-  T (RDIV_EXPR, IFN_COND_RDIV) \
-  T (MIN_EXPR, IFN_COND_MIN) \
-  T (MAX_EXPR, IFN_COND_MAX) \
-  T (BIT_AND_EXPR, IFN_COND_AND) \
-  T (BIT_IOR_EXPR, IFN_COND_IOR) \
-  T (BIT_XOR_EXPR, IFN_COND_XOR) \
-  T (LSHIFT_EXPR, IFN_COND_SHL) \
-  T (RSHIFT_EXPR, IFN_COND_SHR) \
-  T (NEGATE_EXPR, IFN_COND_NEG)
+  T (PLUS_EXPR, ADD) \
+  T (MINUS_EXPR, SUB) \
+  T (MULT_EXPR, MUL) \
+  T (TRUNC_DIV_EXPR, DIV) \
+  T (TRUNC_MOD_EXPR, MOD) \
+  T (RDIV_EXPR, RDIV) \
+  T (MIN_EXPR, MIN) \
+  T (MAX_EXPR, MAX) \
+  T (BIT_AND_EXPR, AND) \
+  T (BIT_IOR_EXPR, IOR) \
+  T (BIT_XOR_EXPR, XOR) \
+  T (LSHIFT_EXPR, SHL) \
+  T (RSHIFT_EXPR, SHR) \
+  T (NEGATE_EXPR, NEG)
 
 /* Return a function that only performs CODE when a certain condition is met
    and that uses a given fallback value otherwise.  For example, if CODE is
@@ -4313,7 +4314,7 @@  get_conditional_internal_fn (tree_code code)
 {
   switch (code)
     {
-#define CASE(CODE, IFN) case CODE: return IFN;
+#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4329,7 +4330,7 @@  conditional_internal_fn_code (internal_fn ifn)
 {
   switch (ifn)
     {
-#define CASE(CODE, IFN) case IFN: return CODE;
+#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
     default:
@@ -4337,6 +4338,34 @@  conditional_internal_fn_code (internal_fn ifn)
     }
 }
 
+/* Return a function that only performs CODE when a certain condition is met
+   and that uses a given fallback value otherwise.  For example, if CODE is
+   a binary operation associated with conditional function FN:
+
+     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
+
+   is equivalent to the C expression:
+
+     for (int i = 0; i < LEN + BIAS; i++)
+       LHS[i] = COND[i] ? A[i] CODE B[i] : ELSE[i];
+
+   operating elementwise if the operands are vectors.
+
+   Return IFN_LAST if no such function exists.  */
+
+internal_fn
+get_conditional_len_internal_fn (tree_code code)
+{
+  switch (code)
+    {
+#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
+      FOR_EACH_CODE_MAPPING(CASE)
+#undef CASE
+    default:
+      return IFN_LAST;
+    }
+}
+
 /* Invoke T(IFN) for each internal function IFN that also has an
    IFN_COND_* form.  */
 #define FOR_EACH_COND_FN_PAIR(T) \
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..dd1bab0bddf 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -224,6 +224,7 @@  extern bool set_edom_supported_p (void);
 
 extern internal_fn get_conditional_internal_fn (tree_code);
 extern internal_fn get_conditional_internal_fn (internal_fn);
+extern internal_fn get_conditional_len_internal_fn (tree_code);
 extern tree_code conditional_internal_fn_code (internal_fn);
 extern internal_fn get_unconditional_internal_fn (internal_fn);
 extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 10e71178ce7..dd24f017235 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6711,7 +6711,9 @@  vectorizable_operation (vec_info *vinfo,
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   internal_fn cond_fn = get_conditional_internal_fn (code);
+  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
 
   /* If operating on inactive elements could generate spurious traps,
      we need to restrict the operation to active lanes.  Note that this
@@ -6730,9 +6732,17 @@  vectorizable_operation (vec_info *vinfo,
 	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
 	  && mask_out_inactive)
 	{
-	  if (cond_fn == IFN_LAST
-	      || !direct_internal_fn_supported_p (cond_fn, vectype,
-						  OPTIMIZE_FOR_SPEED))
+	  if (cond_fn != IFN_LAST
+	      && direct_internal_fn_supported_p (cond_fn, vectype,
+						 OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+				   vectype, NULL);
+	  else if (cond_len_fn != IFN_LAST
+		   && direct_internal_fn_supported_p (cond_len_fn, vectype,
+						      OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
+				  1);
+	  else
 	    {
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6740,9 +6750,6 @@  vectorizable_operation (vec_info *vinfo,
 				 " conditional operation is available.\n");
 	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 	    }
-	  else
-	    vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-				   vectype, NULL);
 	}
 
       /* Put types on constant and invariant SLP children.  */
@@ -6805,6 +6812,7 @@  vectorizable_operation (vec_info *vinfo,
                      "transform binary/unary operation.\n");
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
 
   /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
      vectors with unsigned elements, but the result is signed.  So, we
@@ -6971,11 +6979,16 @@  vectorizable_operation (vec_info *vinfo,
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
 	}
-      else if (masked_loop_p && mask_out_inactive)
+      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
 	{
-	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
-					  vec_num * ncopies, vectype, i);
-	  auto_vec<tree> vops (5);
+	  tree mask;
+	  if (masked_loop_p)
+	    mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
+				       vec_num * ncopies, vectype, i);
+	  else
+	    /* Dummy mask.  */
+	    mask = build_minus_one_cst (truth_type_for (vectype));
+	  auto_vec<tree> vops (6);
 	  vops.quick_push (mask);
 	  vops.quick_push (vop0);
 	  if (vop1)
@@ -6995,7 +7008,20 @@  vectorizable_operation (vec_info *vinfo,
 		(cond_fn, vectype, vops.length () - 1, &vops[1]);
 	      vops.quick_push (else_value);
 	    }
-	  gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
+	  if (len_loop_p)
+	    {
+	      tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
+					    vec_num * ncopies, vectype, i, 1);
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      tree bias = build_int_cst (intQI_type_node, biasval);
+	      vops.quick_push (len);
+	      vops.quick_push (bias);
+	    }
+	  gcall *call
+	    = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
+							    : cond_len_fn,
+					      vops);
 	  new_temp = make_ssa_name (vec_dest, call);
 	  gimple_call_set_lhs (call, new_temp);
 	  gimple_call_set_nothrow (call, true);
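
A closing note on the CASE/FOR_EACH_CODE_MAPPING idiom in the patch: it is a
classic X-macro, in which one mapping table expands into several switch
statements.  A standalone sketch with a trimmed, hypothetical enum (the real
table and enum live in internal-fn.cc):

  #include <cassert>

  // Standalone model of the X-macro idiom; two entries only, with
  // hypothetical enum values, not the real internal_fn enum.
  enum code_model { M_PLUS_EXPR, M_MINUS_EXPR, M_OTHER };
  enum ifn_model { M_IFN_COND_LEN_ADD, M_IFN_COND_LEN_SUB, M_IFN_LAST };

  #define FOR_EACH_CODE_MAPPING_M(T) \
    T (M_PLUS_EXPR, ADD) \
    T (M_MINUS_EXPR, SUB)

  static ifn_model
  get_conditional_len_internal_fn_model (code_model code)
  {
    switch (code)
      {
  #define CASE(CODE, IFN) case CODE: return M_IFN_COND_LEN_##IFN;
        FOR_EACH_CODE_MAPPING_M (CASE)
  #undef CASE
      default:
        return M_IFN_LAST;
      }
  }

  int
  main ()
  {
    assert (get_conditional_len_internal_fn_model (M_PLUS_EXPR)
            == M_IFN_COND_LEN_ADD);
    assert (get_conditional_len_internal_fn_model (M_OTHER) == M_IFN_LAST);
    return 0;
  }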