VECT: Apply LEN_FOLD_EXTRACT_LAST into loop vectorizer

Message ID 20230824020836.48335-1-juzhe.zhong@rivai.ai
State Accepted
Headers
Series VECT: Apply LEN_FOLD_EXTRACT_LAST into loop vectorizer |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai Aug. 24, 2023, 2:08 a.m. UTC
  Hi.

This patch applies LEN_FOLD_EXTRACT_LAST in the loop vectorizer.

Consider the following case:
#include <stdint.h>

#define N 32

/* Simple condition reduction: scan the first N elements of A and return
   the index of the last one that is less than MIN_V.  If no element
   satisfies the condition, the initial value 66 is returned unchanged.  */

int __attribute__ ((noinline, noclone))
condition_reduction (int *a, int min_v)
{
  int last = 66; /* High start value.  */

  for (int i = 0; i < N; i++)
    if (a[i] < min_v)
      last = i;

  return last;
}

With this patch, we can generate the following IR:

  _44 = .SELECT_VL (ivtmp_42, POLY_INT_CST [4, 4]);
  _34 = vect_vec_iv_.5_33 + { POLY_INT_CST [4, 4], ... };
  ivtmp_36 = _44 * 4;
  vect__4.8_39 = .MASK_LEN_LOAD (vectp_a.6_37, 32B, { -1, ... }, _44, 0);

  mask__11.9_41 = vect__4.8_39 < vect_cst__40;
  last_5 = .LEN_FOLD_EXTRACT_LAST (last_14, mask__11.9_41, vect_vec_iv_.5_33, _44, 0);
  ...

gcc/ChangeLog:

        * tree-vect-loop.cc (vectorizable_reduction): Apply LEN_FOLD_EXTRACT_LAST.
        * tree-vect-stmts.cc (vectorizable_condition): Ditto.

---
 gcc/tree-vect-loop.cc  |  7 ++++--
 gcc/tree-vect-stmts.cc | 52 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 50 insertions(+), 9 deletions(-)
  

Comments

Richard Biener Aug. 24, 2023, 6:39 a.m. UTC | #1
On Thu, 24 Aug 2023, Juzhe-Zhong wrote:

> Hi.
> 
> This patch is apply LEN_FOLD_EXTRACT_LAST into loop vectorizer.
> 
> Consider this following case:
> #include <stdint.h>
> 
> #define N 32
> 
> /* Simple condition reduction.  */
> 
> int __attribute__ ((noinline, noclone))
> condition_reduction (int *a, int min_v)
> {
>   int last = 66; /* High start value.  */
> 
>   for (int i = 0; i < N; i++)
>     if (a[i] < min_v)
>       last = i;
> 
>   return last;
> }
> 
> With this patch, we can generate this following IR:
> 
>   _44 = .SELECT_VL (ivtmp_42, POLY_INT_CST [4, 4]);
>   _34 = vect_vec_iv_.5_33 + { POLY_INT_CST [4, 4], ... };
>   ivtmp_36 = _44 * 4;
>   vect__4.8_39 = .MASK_LEN_LOAD (vectp_a.6_37, 32B, { -1, ... }, _44, 0);
> 
>   mask__11.9_41 = vect__4.8_39 < vect_cst__40;
>   last_5 = .LEN_FOLD_EXTRACT_LAST (last_14, mask__11.9_41, vect_vec_iv_.5_33, _44, 0);
>   ...

LGTM.

Thanks,
Richard.

> gcc/ChangeLog:
> 
>         * tree-vect-loop.cc (vectorizable_reduction): Apply LEN_FOLD_EXTRACT_LAST.
>         * tree-vect-stmts.cc (vectorizable_condition): Ditto.
> 
> ---
>  gcc/tree-vect-loop.cc  |  7 ++++--
>  gcc/tree-vect-stmts.cc | 52 ++++++++++++++++++++++++++++++++++++------
>  2 files changed, 50 insertions(+), 9 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 1cd6c291377..ebee8037e02 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7494,8 +7494,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	}
>  
>        if (reduc_chain_length == 1
> -	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
> -					     vectype_in, OPTIMIZE_FOR_SPEED))
> +	  && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
> +					      OPTIMIZE_FOR_SPEED)
> +	      || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> +						 vectype_in,
> +						 OPTIMIZE_FOR_SPEED)))
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 413a88750d6..be9f3a280bd 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -11740,8 +11740,17 @@ vectorizable_condition (vec_info *vinfo,
>  	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  	{
>  	  if (reduction_type == EXTRACT_LAST_REDUCTION)
> -	    vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
> -				   ncopies * vec_num, vectype, NULL);
> +	    {
> +	      if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> +						  vectype, OPTIMIZE_FOR_SPEED))
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo),
> +				      ncopies * vec_num, vectype, 1);
> +	      else
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo),
> +				       ncopies * vec_num, vectype, NULL);
> +	    }
>  	  /* Extra inactive lanes should be safe for vect_nested_cycle.  */
>  	  else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
>  	    {
> @@ -11772,7 +11781,13 @@ vectorizable_condition (vec_info *vinfo,
>       mask to the condition, or to its inverse.  */
>  
>    vec_loop_masks *masks = NULL;
> -  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +  vec_loop_lens *lens = NULL;
> +  if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +    {
> +      if (reduction_type == EXTRACT_LAST_REDUCTION)
> +	lens = &LOOP_VINFO_LENS (loop_vinfo);
> +    }
> +  else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>      {
>        if (reduction_type == EXTRACT_LAST_REDUCTION)
>  	masks = &LOOP_VINFO_MASKS (loop_vinfo);
> @@ -11910,7 +11925,8 @@ vectorizable_condition (vec_info *vinfo,
>        /* Force vec_compare to be an SSA_NAME rather than a comparison,
>  	 in cases where that's necessary.  */
>  
> -      if (masks || reduction_type == EXTRACT_LAST_REDUCTION)
> +      tree len = NULL_TREE, bias = NULL_TREE;
> +      if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
>  	{
>  	  if (!is_gimple_val (vec_compare))
>  	    {
> @@ -11931,6 +11947,23 @@ vectorizable_condition (vec_info *vinfo,
>  	      vec_compare = vec_compare_name;
>  	    }
>  
> +	  if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> +					      vectype, OPTIMIZE_FOR_SPEED))
> +	    {
> +	      if (lens)
> +		{
> +		  len = vect_get_loop_len (loop_vinfo, gsi, lens,
> +					   vec_num * ncopies, vectype, i, 1);
> +		  signed char biasval
> +		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +		  bias = build_int_cst (intQI_type_node, biasval);
> +		}
> +	      else
> +		{
> +		  len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +		  bias = build_int_cst (intQI_type_node, 0);
> +		}
> +	    }
>  	  if (masks)
>  	    {
>  	      tree loop_mask
> @@ -11950,9 +11983,14 @@ vectorizable_condition (vec_info *vinfo,
>  	{
>  	  gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
>  	  tree lhs = gimple_get_lhs (old_stmt);
> -	  new_stmt = gimple_build_call_internal
> -	      (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
> -	       vec_then_clause);
> +	  if (len)
> +	    new_stmt = gimple_build_call_internal
> +	        (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
> +	         vec_then_clause, len, bias);
> +	  else
> +	    new_stmt = gimple_build_call_internal
> +	        (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
> +	         vec_then_clause);
>  	  gimple_call_set_lhs (new_stmt, lhs);
>  	  SSA_NAME_DEF_STMT (lhs) = new_stmt;
>  	  if (old_stmt == gsi_stmt (*gsi))
>
  
Li, Pan2 via Gcc-patches Aug. 24, 2023, 9:03 a.m. UTC | #2
Committed, thanks Richard.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Richard Biener via Gcc-patches
Sent: Thursday, August 24, 2023 2:39 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Cc: gcc-patches@gcc.gnu.org; richard.sandiford@arm.com
Subject: Re: [PATCH] VECT: Apply LEN_FOLD_EXTRACT_LAST into loop vectorizer

On Thu, 24 Aug 2023, Juzhe-Zhong wrote:

> Hi.
> 
> This patch is apply LEN_FOLD_EXTRACT_LAST into loop vectorizer.
> 
> Consider this following case:
> #include <stdint.h>
> 
> #define N 32
> 
> /* Simple condition reduction.  */
> 
> int __attribute__ ((noinline, noclone))
> condition_reduction (int *a, int min_v)
> {
>   int last = 66; /* High start value.  */
> 
>   for (int i = 0; i < N; i++)
>     if (a[i] < min_v)
>       last = i;
> 
>   return last;
> }
> 
> With this patch, we can generate this following IR:
> 
>   _44 = .SELECT_VL (ivtmp_42, POLY_INT_CST [4, 4]);
>   _34 = vect_vec_iv_.5_33 + { POLY_INT_CST [4, 4], ... };
>   ivtmp_36 = _44 * 4;
>   vect__4.8_39 = .MASK_LEN_LOAD (vectp_a.6_37, 32B, { -1, ... }, _44, 0);
> 
>   mask__11.9_41 = vect__4.8_39 < vect_cst__40;
>   last_5 = .LEN_FOLD_EXTRACT_LAST (last_14, mask__11.9_41, vect_vec_iv_.5_33, _44, 0);
>   ...

LGTM.

Thanks,
Richard.

> gcc/ChangeLog:
> 
>         * tree-vect-loop.cc (vectorizable_reduction): Apply LEN_FOLD_EXTRACT_LAST.
>         * tree-vect-stmts.cc (vectorizable_condition): Ditto.
> 
> ---
>  gcc/tree-vect-loop.cc  |  7 ++++--
>  gcc/tree-vect-stmts.cc | 52 ++++++++++++++++++++++++++++++++++++------
>  2 files changed, 50 insertions(+), 9 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 1cd6c291377..ebee8037e02 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7494,8 +7494,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	}
>  
>        if (reduc_chain_length == 1
> -	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
> -					     vectype_in, OPTIMIZE_FOR_SPEED))
> +	  && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
> +					      OPTIMIZE_FOR_SPEED)
> +	      || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> +						 vectype_in,
> +						 OPTIMIZE_FOR_SPEED)))
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 413a88750d6..be9f3a280bd 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -11740,8 +11740,17 @@ vectorizable_condition (vec_info *vinfo,
>  	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
>  	{
>  	  if (reduction_type == EXTRACT_LAST_REDUCTION)
> -	    vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
> -				   ncopies * vec_num, vectype, NULL);
> +	    {
> +	      if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> +						  vectype, OPTIMIZE_FOR_SPEED))
> +		vect_record_loop_len (loop_vinfo,
> +				      &LOOP_VINFO_LENS (loop_vinfo),
> +				      ncopies * vec_num, vectype, 1);
> +	      else
> +		vect_record_loop_mask (loop_vinfo,
> +				       &LOOP_VINFO_MASKS (loop_vinfo),
> +				       ncopies * vec_num, vectype, NULL);
> +	    }
>  	  /* Extra inactive lanes should be safe for vect_nested_cycle.  */
>  	  else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
>  	    {
> @@ -11772,7 +11781,13 @@ vectorizable_condition (vec_info *vinfo,
>       mask to the condition, or to its inverse.  */
>  
>    vec_loop_masks *masks = NULL;
> -  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +  vec_loop_lens *lens = NULL;
> +  if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +    {
> +      if (reduction_type == EXTRACT_LAST_REDUCTION)
> +	lens = &LOOP_VINFO_LENS (loop_vinfo);
> +    }
> +  else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>      {
>        if (reduction_type == EXTRACT_LAST_REDUCTION)
>  	masks = &LOOP_VINFO_MASKS (loop_vinfo);
> @@ -11910,7 +11925,8 @@ vectorizable_condition (vec_info *vinfo,
>        /* Force vec_compare to be an SSA_NAME rather than a comparison,
>  	 in cases where that's necessary.  */
>  
> -      if (masks || reduction_type == EXTRACT_LAST_REDUCTION)
> +      tree len = NULL_TREE, bias = NULL_TREE;
> +      if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
>  	{
>  	  if (!is_gimple_val (vec_compare))
>  	    {
> @@ -11931,6 +11947,23 @@ vectorizable_condition (vec_info *vinfo,
>  	      vec_compare = vec_compare_name;
>  	    }
>  
> +	  if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> +					      vectype, OPTIMIZE_FOR_SPEED))
> +	    {
> +	      if (lens)
> +		{
> +		  len = vect_get_loop_len (loop_vinfo, gsi, lens,
> +					   vec_num * ncopies, vectype, i, 1);
> +		  signed char biasval
> +		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +		  bias = build_int_cst (intQI_type_node, biasval);
> +		}
> +	      else
> +		{
> +		  len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +		  bias = build_int_cst (intQI_type_node, 0);
> +		}
> +	    }
>  	  if (masks)
>  	    {
>  	      tree loop_mask
> @@ -11950,9 +11983,14 @@ vectorizable_condition (vec_info *vinfo,
>  	{
>  	  gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
>  	  tree lhs = gimple_get_lhs (old_stmt);
> -	  new_stmt = gimple_build_call_internal
> -	      (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
> -	       vec_then_clause);
> +	  if (len)
> +	    new_stmt = gimple_build_call_internal
> +	        (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
> +	         vec_then_clause, len, bias);
> +	  else
> +	    new_stmt = gimple_build_call_internal
> +	        (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
> +	         vec_then_clause);
>  	  gimple_call_set_lhs (new_stmt, lhs);
>  	  SSA_NAME_DEF_STMT (lhs) = new_stmt;
>  	  if (old_stmt == gsi_stmt (*gsi))
>
  

Patch

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1cd6c291377..ebee8037e02 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7494,8 +7494,11 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
 	}
 
       if (reduc_chain_length == 1
-	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
-					     vectype_in, OPTIMIZE_FOR_SPEED))
+	  && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
+					      OPTIMIZE_FOR_SPEED)
+	      || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+						 vectype_in,
+						 OPTIMIZE_FOR_SPEED)))
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 413a88750d6..be9f3a280bd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11740,8 +11740,17 @@  vectorizable_condition (vec_info *vinfo,
 	  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
 	{
 	  if (reduction_type == EXTRACT_LAST_REDUCTION)
-	    vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
-				   ncopies * vec_num, vectype, NULL);
+	    {
+	      if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+						  vectype, OPTIMIZE_FOR_SPEED))
+		vect_record_loop_len (loop_vinfo,
+				      &LOOP_VINFO_LENS (loop_vinfo),
+				      ncopies * vec_num, vectype, 1);
+	      else
+		vect_record_loop_mask (loop_vinfo,
+				       &LOOP_VINFO_MASKS (loop_vinfo),
+				       ncopies * vec_num, vectype, NULL);
+	    }
 	  /* Extra inactive lanes should be safe for vect_nested_cycle.  */
 	  else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
 	    {
@@ -11772,7 +11781,13 @@  vectorizable_condition (vec_info *vinfo,
      mask to the condition, or to its inverse.  */
 
   vec_loop_masks *masks = NULL;
-  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  vec_loop_lens *lens = NULL;
+  if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+    {
+      if (reduction_type == EXTRACT_LAST_REDUCTION)
+	lens = &LOOP_VINFO_LENS (loop_vinfo);
+    }
+  else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     {
       if (reduction_type == EXTRACT_LAST_REDUCTION)
 	masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -11910,7 +11925,8 @@  vectorizable_condition (vec_info *vinfo,
       /* Force vec_compare to be an SSA_NAME rather than a comparison,
 	 in cases where that's necessary.  */
 
-      if (masks || reduction_type == EXTRACT_LAST_REDUCTION)
+      tree len = NULL_TREE, bias = NULL_TREE;
+      if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
 	{
 	  if (!is_gimple_val (vec_compare))
 	    {
@@ -11931,6 +11947,23 @@  vectorizable_condition (vec_info *vinfo,
 	      vec_compare = vec_compare_name;
 	    }
 
+	  if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+					      vectype, OPTIMIZE_FOR_SPEED))
+	    {
+	      if (lens)
+		{
+		  len = vect_get_loop_len (loop_vinfo, gsi, lens,
+					   vec_num * ncopies, vectype, i, 1);
+		  signed char biasval
+		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		  bias = build_int_cst (intQI_type_node, biasval);
+		}
+	      else
+		{
+		  len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+		  bias = build_int_cst (intQI_type_node, 0);
+		}
+	    }
 	  if (masks)
 	    {
 	      tree loop_mask
@@ -11950,9 +11983,14 @@  vectorizable_condition (vec_info *vinfo,
 	{
 	  gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
 	  tree lhs = gimple_get_lhs (old_stmt);
-	  new_stmt = gimple_build_call_internal
-	      (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
-	       vec_then_clause);
+	  if (len)
+	    new_stmt = gimple_build_call_internal
+	        (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
+	         vec_then_clause, len, bias);
+	  else
+	    new_stmt = gimple_build_call_internal
+	        (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
+	         vec_then_clause);
 	  gimple_call_set_lhs (new_stmt, lhs);
 	  SSA_NAME_DEF_STMT (lhs) = new_stmt;
 	  if (old_stmt == gsi_stmt (*gsi))