RISC-V: Enable basic RVV auto-vectorization support

Message ID 20230407012503.65215-1-juzhe.zhong@rivai.ai
State Accepted
Headers
Series RISC-V: Enable basic RVV auto-vectorization support |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai April 7, 2023, 1:25 a.m. UTC
  From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

Enable basic auto-vectorization support of WHILE_LEN/LEN_LOAD/LEN_STORE.
gcc/ChangeLog:

        * config/riscv/riscv-protos.h (preferred_simd_mode): New function.
        (expand_while_len): Ditto.
        * config/riscv/riscv-v.cc (autovec_use_vlmax_p): Ditto.
        (preferred_simd_mode): Ditto.
        (expand_while_len): Ditto.
        * config/riscv/riscv.cc (riscv_convert_vector_bits): Add basic auto-vectorization support.
        (riscv_preferred_simd_mode): New function.
        (TARGET_VECTORIZE_PREFERRED_SIMD_MODE): New targethook for RVV auto-vectorization support.
        * config/riscv/vector.md: Add basic autovec.
        * config/riscv/autovec.md: New file.

---
 gcc/config/riscv/autovec.md     | 63 ++++++++++++++++++++++++++
 gcc/config/riscv/riscv-protos.h |  2 +
 gcc/config/riscv/riscv-v.cc     | 78 +++++++++++++++++++++++++++++++++
 gcc/config/riscv/riscv.cc       | 24 +++++++++-
 gcc/config/riscv/vector.md      |  4 +-
 5 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 gcc/config/riscv/autovec.md
  

Comments

Jeff Law April 25, 2023, 6:20 a.m. UTC | #1
On 4/6/23 19:25, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
> Enable basic auto-vectorization support of WHILE_LEN/LEN_LOAD/LEN_STORE.
> gcc/ChangeLog:
> 
>          * config/riscv/riscv-protos.h (preferred_simd_mode): New function.
>          (expand_while_len): Ditto.
>          * config/riscv/riscv-v.cc (autovec_use_vlmax_p): Ditto.
>          (preferred_simd_mode): Ditto.
>          (expand_while_len): Ditto.
>          * config/riscv/riscv.cc (riscv_convert_vector_bits): Add basic auto-vectorization support.
>          (riscv_preferred_simd_mode): New function.
>          (TARGET_VECTORIZE_PREFERRED_SIMD_MODE): New targethook for RVV auto-vectorization support.
>          * config/riscv/vector.md: Add basic autovec.
>          * config/riscv/autovec.md: New file.
> 

> +
> +;; len_load/len_store is a sub-optimal pattern for RVV auto-vectorization support.
> +;; We will replace them when len_maskload/len_maskstore is supported in loop vectorizer.
Presumably these are the key primitives on which you want to build all
the basic vector memory operations?

We should keep in mind strided accesses which can be important for x264.



> @@ -729,4 +730,81 @@ gen_avl_for_scalar_move (rtx avl)
>       }
>   }
>   
> +/* SCALABLE means that the vector-length is agnostic (run-time invariant and
> +   compile-time unknown). FIXED meands that the vector-length is specific
> +   (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX are doing
> +   auto-vectorization using VLMAX vsetvl configuration.  */
Typo.  meands -> means.


> +static bool
> +autovec_use_vlmax_p (void)
> +{
> +  return riscv_autovec_preference == RVV_SCALABLE
> +	 || riscv_autovec_preference == RVV_FIXED_VLMAX;
> +}
Formatting nit.  Add parens when you have to wrap lines.

> +
> +/* Return the vectorization machine mode for RVV according to LMUL.  */
mode -> MODE.  In general when referring to a function argument use all 
caps.


> +machine_mode
> +preferred_simd_mode (scalar_mode mode)
> +{
> +  /* We only enable auto-vectorization when TARGET_MIN_VLEN >= 128
> +     which is -march=rv64gcv. Since GCC loop vectorizer report ICE
> +     when we enable -march=rv64gc_zve32* and -march=rv32gc_zve64*.
> +     in the 'can_duplicate_and_interleave_p' of tree-vect-slp.cc. Since we have
> +     VNx1SImode in -march=*zve32* and VNx1DImode in -march=*zve64*, they are
> +     enabled in targetm. vector_mode_supported_p and SLP vectorizer will try to
> +     use them. Currently, we can support auto-vectorization in
> +     -march=rv32_zve32x_zvl128b. Wheras, -march=rv32_zve32x_zvl32b or
> +     -march=rv32_zve32x_zvl64b are disabled.
> + */
> +  if (autovec_use_vlmax_p ())
> +    {
> +      /* If TARGET_MIN_VLEN * riscv_autovec_lmul < 128, we don't allow
> +	 auto-vectorization since Loop Vectorizer may use VNx1SImode or
> +	 VNx1DImode to vectorize which will create ICE in the
> +	 'can_duplicate_and_interleave_p' of tree-vect-slp.cc.  */
> +      if (TARGET_MIN_VLEN * riscv_autovec_lmul < 128)
> +	return word_mode;
> +      /* We use LMUL = 1 as base bytesize which is BYTES_PER_RISCV_VECTOR and
> +	 riscv_autovec_lmul as multiply factor to calculate the the NUNITS to
> +	 get the auto-vectorization mode.  */
> +      poly_uint64 nunits;
> +      poly_uint64 vector_size
> +	= BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul);
> +      poly_uint64 scalar_size = GET_MODE_SIZE (mode);
> +      if (!multiple_p (vector_size, scalar_size, &nunits))
> +	return word_mode;
> +      machine_mode rvv_mode;
> +      if (get_vector_mode (mode, nunits).exists (&rvv_mode))
> +	return rvv_mode;
> +    }
Is there a reason not to emit a diagnostic when the user asks for a 
configuration we cannot currently support?  Are the limitations 
documented in invoke.texi?


> +
> +/* Expand WHILE_LEN pattern. If we can find a mode for a corresponding
> +   NUNITS, we emit vsetvl instructions directly. Otherwise, we emit
> +   UMIN (operand1, NUNITS).  */

> +void
> +expand_while_len (rtx *ops)
> +{
> +  poly_int64 nunits;
> +  gcc_assert (poly_int_rtx_p (ops[2], &nunits));
> +  /* We arbitrary picked QImode as inner scalar mode to get vector mode.
> +     since vsetvl only demand ratio. We let VSETVL PASS to optimize it.  */
> +  scalar_int_mode mode = QImode;
> +  machine_mode rvv_mode;
> +  if (get_vector_mode (mode, nunits).exists (&rvv_mode))
> +    {
> +      rtx vsetvl_rtx
> +	= gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]);
> +      emit_insn (vsetvl_rtx);
> +    }
> +  else
> +    {
> +      rtx tmp = gen_reg_rtx (Pmode);
> +      emit_move_insn (tmp, gen_int_mode (nunits, Pmode));
> +      expand_binop (Pmode, umin_optab, tmp, ops[1], ops[0], true, OPTAB_LIB);
> +    }
> +}
I thought it had been determined that WHILE_LEN wasn't actually a true 
MIN operation and instead was slightly more complex?  Or did I mis-remember?


No major concerns here.  I'll hold off ACKing pending answers to the 
questions about whether or not we should emit diagnostics, documentation 
of the limitations, and the WHILE_LEN question.

Jeff
  

Patch

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
new file mode 100644
index 00000000000..34561383041
--- /dev/null
+++ b/gcc/config/riscv/autovec.md
@@ -0,0 +1,63 @@ 
+;; Machine description for auto-vectorization using RVV for GNU compiler.
+;; Copyright (C) 2023 Free Software Foundation, Inc.
+;; Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; =========================================================================
+;; == While_len
+;; =========================================================================
+
+(define_expand "while_len<mode>"
+  [(match_operand:P 0 "register_operand")
+   (match_operand:P 1 "vector_length_operand")
+   (match_operand:P 2 "")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_while_len (operands);
+  DONE;
+})
+
+;; =========================================================================
+;; == Loads/Stores
+;; =========================================================================
+
+;; len_load/len_store is a sub-optimal pattern for RVV auto-vectorization support.
+;; We will replace them when len_maskload/len_maskstore is supported in loop vectorizer.
+(define_expand "len_load_<mode>"
+  [(match_operand:V 0 "register_operand")
+   (match_operand:V 1 "memory_operand")
+   (match_operand 2 "vector_length_operand")
+   (match_operand 3 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (<MODE>mode), operands[0],
+				  operands[1], operands[2], <VM>mode);
+  DONE;
+})
+
+(define_expand "len_store_<mode>"
+  [(match_operand:V 0 "memory_operand")
+   (match_operand:V 1 "register_operand")
+   (match_operand 2 "vector_length_operand")
+   (match_operand 3 "const_0_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (<MODE>mode), operands[0],
+				  operands[1], operands[2], <VM>mode);
+  DONE;
+})
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 4611447ddde..6cd91987199 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -206,6 +206,8 @@  enum vlen_enum
 bool slide1_sew64_helper (int, machine_mode, machine_mode,
 			  machine_mode, rtx *);
 rtx gen_avl_for_scalar_move (rtx);
+machine_mode preferred_simd_mode (scalar_mode);
+void expand_while_len (rtx *);
 }
 
 /* We classify builtin types into two classes:
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index ed3c5e0756f..84d33fcdd14 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -43,6 +43,7 @@ 
 #include "optabs.h"
 #include "tm-constrs.h"
 #include "rtx-vector-builder.h"
+#include "targhooks.h"
 
 using namespace riscv_vector;
 
@@ -729,4 +730,81 @@  gen_avl_for_scalar_move (rtx avl)
     }
 }
 
+/* SCALABLE means that the vector-length is agnostic (run-time invariant and
+   compile-time unknown). FIXED meands that the vector-length is specific
+   (compile-time known). Both RVV_SCALABLE and RVV_FIXED_VLMAX are doing
+   auto-vectorization using VLMAX vsetvl configuration.  */
+static bool
+autovec_use_vlmax_p (void)
+{
+  return riscv_autovec_preference == RVV_SCALABLE
+	 || riscv_autovec_preference == RVV_FIXED_VLMAX;
+}
+
+/* Return the vectorization machine mode for RVV according to LMUL.  */
+machine_mode
+preferred_simd_mode (scalar_mode mode)
+{
+  /* We only enable auto-vectorization when TARGET_MIN_VLEN >= 128
+     which is -march=rv64gcv. Since GCC loop vectorizer report ICE
+     when we enable -march=rv64gc_zve32* and -march=rv32gc_zve64*.
+     in the 'can_duplicate_and_interleave_p' of tree-vect-slp.cc. Since we have
+     VNx1SImode in -march=*zve32* and VNx1DImode in -march=*zve64*, they are
+     enabled in targetm. vector_mode_supported_p and SLP vectorizer will try to
+     use them. Currently, we can support auto-vectorization in
+     -march=rv32_zve32x_zvl128b. Wheras, -march=rv32_zve32x_zvl32b or
+     -march=rv32_zve32x_zvl64b are disabled.
+ */
+  if (autovec_use_vlmax_p ())
+    {
+      /* If TARGET_MIN_VLEN * riscv_autovec_lmul < 128, we don't allow
+	 auto-vectorization since Loop Vectorizer may use VNx1SImode or
+	 VNx1DImode to vectorize which will create ICE in the
+	 'can_duplicate_and_interleave_p' of tree-vect-slp.cc.  */
+      if (TARGET_MIN_VLEN * riscv_autovec_lmul < 128)
+	return word_mode;
+      /* We use LMUL = 1 as base bytesize which is BYTES_PER_RISCV_VECTOR and
+	 riscv_autovec_lmul as multiply factor to calculate the the NUNITS to
+	 get the auto-vectorization mode.  */
+      poly_uint64 nunits;
+      poly_uint64 vector_size
+	= BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul);
+      poly_uint64 scalar_size = GET_MODE_SIZE (mode);
+      if (!multiple_p (vector_size, scalar_size, &nunits))
+	return word_mode;
+      machine_mode rvv_mode;
+      if (get_vector_mode (mode, nunits).exists (&rvv_mode))
+	return rvv_mode;
+    }
+  /* TODO: We will support minimum length VLS auto-vectorization in the future.
+   */
+  return word_mode;
+}
+
+/* Expand WHILE_LEN pattern. If we can find a mode for a corresponding
+   NUNITS, we emit vsetvl instructions directly. Otherwise, we emit
+   UMIN (operand1, NUNITS).  */
+void
+expand_while_len (rtx *ops)
+{
+  poly_int64 nunits;
+  gcc_assert (poly_int_rtx_p (ops[2], &nunits));
+  /* We arbitrary picked QImode as inner scalar mode to get vector mode.
+     since vsetvl only demand ratio. We let VSETVL PASS to optimize it.  */
+  scalar_int_mode mode = QImode;
+  machine_mode rvv_mode;
+  if (get_vector_mode (mode, nunits).exists (&rvv_mode))
+    {
+      rtx vsetvl_rtx
+	= gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]);
+      emit_insn (vsetvl_rtx);
+    }
+  else
+    {
+      rtx tmp = gen_reg_rtx (Pmode);
+      emit_move_insn (tmp, gen_int_mode (nunits, Pmode));
+      expand_binop (Pmode, umin_optab, tmp, ops[1], ops[0], true, OPTAB_LIB);
+    }
+}
+
 } // namespace riscv_vector
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index b460c8a0b8b..9e507caabb4 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -6217,7 +6217,15 @@  riscv_convert_vector_bits (void)
      to set RVV mode size. The RVV machine modes size are run-time constant if
      TARGET_VECTOR is enabled. The RVV machine modes size remains default
      compile-time constant if TARGET_VECTOR is disabled.  */
-  return TARGET_VECTOR ? poly_uint16 (1, 1) : 1;
+  if (TARGET_VECTOR)
+    {
+      if (riscv_autovec_preference == RVV_FIXED_VLMAX)
+	return (int) TARGET_MIN_VLEN / (riscv_bytes_per_vector_chunk * 8);
+      else
+	return poly_uint16 (1, 1);
+    }
+  else
+    return 1;
 }
 
 /* Implement TARGET_OPTION_OVERRIDE.  */
@@ -7076,6 +7084,17 @@  riscv_shamt_matches_mask_p (int shamt, HOST_WIDE_INT mask)
   return shamt == ctz_hwi (mask);
 }
 
+/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.  */
+
+static machine_mode
+riscv_preferred_simd_mode (scalar_mode mode)
+{
+  if (TARGET_VECTOR)
+    return riscv_vector::preferred_simd_mode (mode);
+
+  return word_mode;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7327,6 +7346,9 @@  riscv_shamt_matches_mask_p (int shamt, HOST_WIDE_INT mask)
 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE riscv_dwarf_poly_indeterminate_value
 
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-riscv.h"
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 07624197a5a..9151a4c9891 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -23,7 +23,7 @@ 
 ;; This file include :
 ;;
 ;; - Intrinsics (https://github.com/riscv/rvv-intrinsic-doc)
-;; - Auto-vectorization (TBD)
+;; - Auto-vectorization (autovec.md)
 ;; - Combine optimization (TBD)
 
 (include "vector-iterators.md")
@@ -7688,3 +7688,5 @@ 
   "vle<sew>ff.v\t%0,%3%p1"
   [(set_attr "type" "vldff")
    (set_attr "mode" "<MODE>")])
+
+(include "autovec.md")