[2/3] RISC-V: setmem for RISCV with V extension

Message ID 20231211094728.1623032-3-slewis@rivosinc.com
State Unresolved
Headers
Series RISC-V: vectorised memory operations |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Sergei Lewis Dec. 11, 2023, 9:47 a.m. UTC
  gcc/ChangeLog

    * config/riscv/riscv-protos.h (riscv_vector::expand_vec_setmem): New function
    declaration.

    * config/riscv/riscv-string.cc (riscv_vector::expand_vec_setmem): New
    function: this generates an inline vectorised memory set, if and only if we
    know the entire operation can be performed in a single vector store

    * config/riscv/riscv.md (setmem<mode>): Try riscv_vector::expand_vec_setmem
    for constant lengths

gcc/testsuite/ChangeLog
    * gcc.target/riscv/rvv/base/setmem-1.c: New tests
---
 gcc/config/riscv/riscv-protos.h               |  1 +
 gcc/config/riscv/riscv-string.cc              | 82 +++++++++++++++
 gcc/config/riscv/riscv.md                     | 14 +++
 .../gcc.target/riscv/rvv/base/setmem-1.c      | 99 +++++++++++++++++++
 4 files changed, 196 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
  

Comments

Kito Cheng Dec. 11, 2023, 1:38 p.m. UTC | #1
On Mon, Dec 11, 2023 at 5:48 PM Sergei Lewis <slewis@rivosinc.com> wrote:
>
> gcc/ChangeLog
>
>     * config/riscv/riscv-protos.h (riscv_vector::expand_vec_setmem): New function
>     declaration.
>
>     * config/riscv/riscv-string.cc (riscv_vector::expand_vec_setmem): New
>     function: this generates an inline vectorised memory set, if and only if we
>     know the entire operation can be performed in a single vector store
>
>     * config/riscv/riscv.md (setmem<mode>): Try riscv_vector::expand_vec_setmem
>     for constant lengths
>
> gcc/testsuite/ChangeLog
>     * gcc.target/riscv/rvv/base/setmem-1.c: New tests
> ---
>  gcc/config/riscv/riscv-protos.h               |  1 +
>  gcc/config/riscv/riscv-string.cc              | 82 +++++++++++++++
>  gcc/config/riscv/riscv.md                     | 14 +++
>  .../gcc.target/riscv/rvv/base/setmem-1.c      | 99 +++++++++++++++++++
>  4 files changed, 196 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 20bbb5b859c..950cb65c910 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -560,6 +560,7 @@ void expand_popcount (rtx *);
>  void expand_rawmemchr (machine_mode, rtx, rtx, rtx, bool = false);
>  bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool);
>  void emit_vec_extract (rtx, rtx, poly_int64);
> +bool expand_vec_setmem (rtx, rtx, rtx, rtx);
>
>  /* Rounding mode bitfield for fixed point VXRM.  */
>  enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
> index 11c1f74d0b3..0abbd5f8b28 100644
> --- a/gcc/config/riscv/riscv-string.cc
> +++ b/gcc/config/riscv/riscv-string.cc
> @@ -1247,4 +1247,86 @@ expand_strcmp (rtx result, rtx src1, rtx src2, rtx nbytes,
>    return true;
>  }
>
> +
> +/* Select appropriate LMUL for a single vector operation based on
> +   byte size of data to be processed.
> +   On success, return true and populate lmul_out.
> +   If length_in is too wide for a single vector operation, return false
> +   and leave lmul_out unchanged.  */
> +
> +static bool
> +select_appropriate_lmul (HOST_WIDE_INT length_in,
> +    HOST_WIDE_INT &lmul_out)
> +{
> +  /* if it's tiny, default operation is likely better; maybe worth
> +     considering fractional lmul in the future as well.  */
> +  if (length_in < (TARGET_MIN_VLEN/8))

(TARGET_MIN_VLEN / 8)

> +    return false;
> +
> +  /* find smallest lmul large enough for entire op.  */
> +  HOST_WIDE_INT lmul = 1;
> +  while ((lmul <= 8) && (length_in > ((lmul*TARGET_MIN_VLEN)/8)))

 ((lmul * TARGET_MIN_VLEN) / 8)))

> +    {
> +      lmul <<= 1;
> +    }
> +
> +  if (lmul > 8)
> +    return false;
> +
> +  lmul_out = lmul;
> +  return true;
> +}
> +
> +/* Used by setmemdi in riscv.md.  */
> +bool
> +expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in,
> +         rtx alignment_in)
> +{
> +  /* we're generating vector code.  */
> +  if (!TARGET_VECTOR)
> +    return false;
> +  /* if we can't reason about the length, let libc handle the operation.  */
> +  if (!CONST_INT_P (length_in))
> +    return false;
> +
> +  HOST_WIDE_INT length = INTVAL (length_in);
> +  HOST_WIDE_INT lmul;
> +
> +  /* select an lmul such that the data just fits into one vector operation;
> +     bail if we can't.  */
> +  if (!select_appropriate_lmul (length, lmul))
> +    return false;
> +
> +  machine_mode vmode = riscv_vector::get_vector_mode (QImode,
> +         BYTES_PER_RISCV_VECTOR * lmul).require ();
> +  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
> +  rtx dst = change_address (dst_in, vmode, dst_addr);
> +
> +  rtx fill_value = gen_reg_rtx (vmode);
> +  rtx broadcast_ops[] = {fill_value, fill_value_in};
> +
> +  /* If the length is exactly vlmax for the selected mode, do that.
> +     Otherwise, use a predicated store.  */
> +  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
> +    {
> +      emit_vlmax_insn (code_for_pred_broadcast (vmode),
> +             UNARY_OP, broadcast_ops);
> +      emit_move_insn (dst, fill_value);
> +    }
> +  else
> +    {
> +      if (!satisfies_constraint_K (length_in))
> +             length_in= force_reg (Pmode, length_in);
> +      emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
> +             broadcast_ops, length_in);
> +      machine_mode mask_mode = riscv_vector::get_vector_mode
> +             (BImode, GET_MODE_NUNITS (vmode)).require ();
> +      rtx mask =  CONSTM1_RTX (mask_mode);
> +      emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
> +             get_avl_type_rtx (riscv_vector::NONVLMAX)));
> +    }
> +
> +  return true;
> +}
> +
>  }
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 88fde290a8a..29d3b1aa342 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -2381,6 +2381,20 @@
>      FAIL;
>  })
>
> +(define_expand "setmemsi"
> +  [(set (match_operand:BLK 0 "memory_operand")     ;; Dest
> +             (match_operand:QI  2 "nonmemory_operand")) ;; Value
> +   (use (match_operand:SI  1 "const_int_operand")) ;; Length
> +   (match_operand:SI       3 "const_int_operand")] ;; Align
> +  "TARGET_VECTOR"
> +{
> +  if (riscv_vector::expand_vec_setmem (operands[0], operands[1], operands[2],
> +      operands[3]))
> +    DONE;
> +  else
> +    FAIL;
> +})
> +
>  ;; Expand in-line code to clear the instruction cache between operand[0] and
>  ;; operand[1].
>  (define_expand "clear_cache"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
> new file mode 100644
> index 00000000000..d1a5ff002a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
> @@ -0,0 +1,99 @@
> +/* { dg-do compile } */
> +/* { dg-add-options riscv_v } */
> +/* { dg-additional-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <string.h>

Drop this to prevent multilib testing issues.

> +
> +#define MIN_VECTOR_BYTES (__riscv_v_min_vlen/8)
> +
> +/* tiny memsets should use scalar ops
> +** f1:
> +**  sb\s+a1,0\(a0\)
> +**  ret
> +*/
> +void * f1 (void *a, int const b)
> +{
> +  return memset (a, b, 1);

__builtin_memset instead of memset

> +}
> +
> +/* tiny memsets should use scalar ops
> +** f2:
> +**  sb\s+a1,0\(a0\)
> +**  sb\s+a1,1\(a0\)
> +**  ret
> +*/
> +void * f2 (void *a, int const b)
> +{
> +  return memset (a, b, 2);

Ditto.

> +}
> +
> +/* tiny memsets should use scalar ops
> +** f3:
> +**  sb\s+a1,0\(a0\)
> +**  sb\s+a1,1\(a0\)
> +**  sb\s+a1,2\(a0\)
> +**  ret
> +*/
> +void * f3 (void *a, int const b)
> +{
> +  return memset (a, b, 3);

Ditto.

> +}
> +
> +/* vectorise+inline minimum vector register width with LMUL=1
> +** f4:
> +**  (
> +**  vsetivli\s+zero,\d+,e8,m1,ta,ma
> +**  |
> +**  li\s+a\d+,\d+
> +**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
> +**  )
> +**  vmv\.v\.x\s+v\d+,a1
> +**  vse8\.v\s+v\d+,0\(a0\)
> +**  ret
> +*/
> +void * f4 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES);

Ditto.

> +}
> +
> +/* vectorised code should use smallest lmul known to fit length
> +** f5:
> +**  (
> +**  vsetivli\s+zero,\d+,e8,m2,ta,ma
> +**  |
> +**  li\s+a\d+,\d+
> +**  vsetvli\s+zero,a\d+,e8,m2,ta,ma
> +**  )
> +**  vmv\.v\.x\s+v\d+,a1
> +**  vse8\.v\s+v\d+,0\(a0\)
> +**  ret
> +*/
> +void * f5 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES+1);

Ditto.

> +}
> +
> +/* vectorise+inline up to LMUL=8
> +** f6:
> +**  li\s+a\d+,\d+
> +**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
> +**  vmv\.v\.x\s+v\d+,a1
> +**  vse8\.v\s+v\d+,0\(a0\)
> +**  ret
> +*/
> +void * f6 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES*8);

Ditto.

> +}
> +
> +/* don't vectorise if the move is too large for one operation
> +** f7:
> +**  li\s+a2,\d+
> +**  tail\s+memset
> +*/
> +void * f7 (void *a, int const b)
> +{
> +  return memset (a, b, MIN_VECTOR_BYTES*8+1);

Ditto.

> +}
> +
> --
> 2.34.1
>
  

Patch

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 20bbb5b859c..950cb65c910 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -560,6 +560,7 @@  void expand_popcount (rtx *);
 void expand_rawmemchr (machine_mode, rtx, rtx, rtx, bool = false);
 bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool);
 void emit_vec_extract (rtx, rtx, poly_int64);
+bool expand_vec_setmem (rtx, rtx, rtx, rtx);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 11c1f74d0b3..0abbd5f8b28 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1247,4 +1247,86 @@  expand_strcmp (rtx result, rtx src1, rtx src2, rtx nbytes,
   return true;
 }
 
+
+/* Select appropriate LMUL for a single vector operation based on
+   byte size of data to be processed.
+   On success, return true and populate lmul_out.
+   If length_in is too wide for a single vector operation, return false
+   and leave lmul_out unchanged.  */
+
+static bool
+select_appropriate_lmul (HOST_WIDE_INT length_in,
+    HOST_WIDE_INT &lmul_out)
+{
+  /* if it's tiny, default operation is likely better; maybe worth
+     considering fractional lmul in the future as well.  */
+  if (length_in < (TARGET_MIN_VLEN/8))
+    return false;
+
+  /* find smallest lmul large enough for entire op.  */
+  HOST_WIDE_INT lmul = 1;
+  while ((lmul <= 8) && (length_in > ((lmul*TARGET_MIN_VLEN)/8)))
+    {
+      lmul <<= 1;
+    }
+
+  if (lmul > 8)
+    return false;
+
+  lmul_out = lmul;
+  return true;
+}
+
+/* Used by setmemdi in riscv.md.  */
+bool
+expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in,
+	  rtx alignment_in)
+{
+  /* we're generating vector code.  */
+  if (!TARGET_VECTOR)
+    return false;
+  /* if we can't reason about the length, let libc handle the operation.  */
+  if (!CONST_INT_P (length_in))
+    return false;
+
+  HOST_WIDE_INT length = INTVAL (length_in);
+  HOST_WIDE_INT lmul;
+
+  /* select an lmul such that the data just fits into one vector operation;
+     bail if we can't.  */
+  if (!select_appropriate_lmul (length, lmul))
+    return false;
+
+  machine_mode vmode = riscv_vector::get_vector_mode (QImode,
+	  BYTES_PER_RISCV_VECTOR * lmul).require ();
+  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+  rtx dst = change_address (dst_in, vmode, dst_addr);
+
+  rtx fill_value = gen_reg_rtx (vmode);
+  rtx broadcast_ops[] = {fill_value, fill_value_in};
+
+  /* If the length is exactly vlmax for the selected mode, do that.
+     Otherwise, use a predicated store.  */
+  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_vlmax_insn (code_for_pred_broadcast (vmode),
+	      UNARY_OP, broadcast_ops);
+      emit_move_insn (dst, fill_value);
+    }
+  else
+    {
+      if (!satisfies_constraint_K (length_in))
+	      length_in= force_reg (Pmode, length_in);
+      emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
+	      broadcast_ops, length_in);
+      machine_mode mask_mode = riscv_vector::get_vector_mode
+	      (BImode, GET_MODE_NUNITS (vmode)).require ();
+      rtx mask =  CONSTM1_RTX (mask_mode);
+      emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
+	      get_avl_type_rtx (riscv_vector::NONVLMAX)));
+    }
+
+  return true;
+}
+
 }
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 88fde290a8a..29d3b1aa342 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2381,6 +2381,20 @@ 
     FAIL;
 })
 
+(define_expand "setmemsi"
+  [(set (match_operand:BLK 0 "memory_operand")     ;; Dest
+	      (match_operand:QI  2 "nonmemory_operand")) ;; Value
+   (use (match_operand:SI  1 "const_int_operand")) ;; Length
+   (match_operand:SI       3 "const_int_operand")] ;; Align
+  "TARGET_VECTOR"
+{
+  if (riscv_vector::expand_vec_setmem (operands[0], operands[1], operands[2],
+      operands[3]))
+    DONE;
+  else
+    FAIL;
+})
+
 ;; Expand in-line code to clear the instruction cache between operand[0] and
 ;; operand[1].
 (define_expand "clear_cache"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
new file mode 100644
index 00000000000..d1a5ff002a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
@@ -0,0 +1,99 @@ 
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <string.h>
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen/8)
+
+/* tiny memsets should use scalar ops
+** f1:
+**  sb\s+a1,0\(a0\)
+**  ret
+*/
+void * f1 (void *a, int const b)
+{
+  return memset (a, b, 1);
+}
+
+/* tiny memsets should use scalar ops
+** f2:
+**  sb\s+a1,0\(a0\)
+**  sb\s+a1,1\(a0\)
+**  ret
+*/
+void * f2 (void *a, int const b)
+{
+  return memset (a, b, 2);
+}
+
+/* tiny memsets should use scalar ops
+** f3:
+**  sb\s+a1,0\(a0\)
+**  sb\s+a1,1\(a0\)
+**  sb\s+a1,2\(a0\)
+**  ret
+*/
+void * f3 (void *a, int const b)
+{
+  return memset (a, b, 3);
+}
+
+/* vectorise+inline minimum vector register width with LMUL=1
+** f4:
+**  (
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
+**  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void * f4 (void *a, int const b)
+{
+  return memset (a, b, MIN_VECTOR_BYTES);
+}
+
+/* vectorised code should use smallest lmul known to fit length
+** f5:
+**  (
+**  vsetivli\s+zero,\d+,e8,m2,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m2,ta,ma
+**  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void * f5 (void *a, int const b)
+{
+  return memset (a, b, MIN_VECTOR_BYTES+1);
+}
+
+/* vectorise+inline up to LMUL=8
+** f6:
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void * f6 (void *a, int const b)
+{
+  return memset (a, b, MIN_VECTOR_BYTES*8);
+}
+
+/* don't vectorise if the move is too large for one operation
+** f7:
+**  li\s+a2,\d+
+**  tail\s+memset
+*/
+void * f7 (void *a, int const b)
+{
+  return memset (a, b, MIN_VECTOR_BYTES*8+1);
+}
+