@@ -492,6 +492,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
machine_mode, rtx *);
rtx gen_avl_for_scalar_move (rtx);
void expand_tuple_move (rtx *);
+bool expand_block_move (rtx, rtx, rtx);
machine_mode preferred_simd_mode (scalar_mode);
machine_mode get_mask_mode (machine_mode);
void expand_vec_series (rtx, rtx, rtx);
@@ -49,6 +49,7 @@
#include "tm-constrs.h"
#include "rtx-vector-builder.h"
#include "targhooks.h"
+#include "predict.h"
using namespace riscv_vector;
@@ -1991,6 +1992,206 @@ expand_tuple_move (rtx *ops)
}
}
+/* Used by the cpymem<mode> expander in riscv.md. */
+
+bool
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+{
+ /*
+ memcpy:
+ mv a3, a0 # Copy destination
+ loop:
+ vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
+ vle8.v v0, (a1) # Load bytes
+ add a1, a1, t0 # Bump pointer
+ sub a2, a2, t0 # Decrement count
+ vse8.v v0, (a3) # Store bytes
+ add a3, a3, t0 # Bump pointer
+ bnez a2, loop # Any more?
+ ret # Return
+ */
+ if (!TARGET_VECTOR)
+ return false;
+ HOST_WIDE_INT potential_ew
+ = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
+ / BITS_PER_UNIT);
+ machine_mode vmode = VOIDmode;
+ bool need_loop = true;
+ bool size_p = optimize_function_for_size_p (cfun);
+ rtx src, dst;
+ rtx vec;
+ rtx length_rtx = length_in;
+
+ if (CONST_INT_P (length_in))
+ {
+ HOST_WIDE_INT length = INTVAL (length_in);
+
+ /* By using LMUL=8, we can copy as many bytes in one go as there
+ are bits in a vector register. If the entire block thus fits,
+ we don't need a loop. */
+ if (length <= TARGET_MIN_VLEN)
+ {
+ need_loop = false;
+
+ /* If a single scalar load / store pair can do the job, leave it
+ to the scalar code to do that. */
+ /* ??? If fast unaligned access is supported, the scalar code could
+ use suitably sized scalars irrespective of alignment. If that
+ gets fixed, we have to adjust the test here. */
+
+ if (pow2p_hwi (length) && length <= potential_ew)
+ return false;
+ }
+
+ /* Find the vector mode to use. Using the largest possible element
+ size is likely to give smaller constants, and thus potentially
+ reduce code size. However, if we need a loop, we need to update
+ the pointers, and that is more complicated with a larger element
+ size, unless we use an immediate, which prevents us from dynamically
+ using the target's transfer size that the hart supports. And then,
+ unless we know the *exact* vector size of the hart, we'd need
+ multiple vsetvli / branch statements, so it's not even a size win.
+ If, in the future, we find an RVV implementation that is slower
+ for small element widths, we might allow larger element widths for
+ loops too. */
+ if (need_loop)
+ potential_ew = 1;
+ for (; potential_ew; potential_ew >>= 1)
+ {
+ scalar_int_mode elem_mode;
+ unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+ unsigned HOST_WIDE_INT per_iter;
+ HOST_WIDE_INT nunits;
+
+ if (need_loop)
+ per_iter = TARGET_MIN_VLEN;
+ else
+ per_iter = length;
+ nunits = per_iter / potential_ew;
+
+ /* Unless we get an implementation that's slow for small element
+ size / non-word-aligned accesses, we assume that the hardware
+ handles this well, and we don't want to complicate the code
+ with shifting word contents around or handling extra bytes at
+ the start and/or end. So we want the total transfer size and
+ alignment to fit with the element size. */
+ if (length % potential_ew != 0
+ || !int_mode_for_size (bits, 0).exists (&elem_mode))
+ continue;
+ /* Find the mode to use for the copy inside the loop - or the
+ sole copy, if there is no loop. */
+ if (!need_loop)
+ {
+ /* Check whether we have an exact mode for the copy. */
+ if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+ break;
+ /* Since we don't have a mode that exactly matches the transfer
+ size, we'll need to use pred_store, which is not available
+ for all vector modes, but only iE_RVV_M* modes, hence trying
+ to find a vector mode for a merely rounded-up size is
+ pointless.
+ Still, by choosing a lower LMUL factor that still allows
+ an entire transfer, we can reduce register pressure. */
+ for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
+ if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
+ /* Avoid losing the option of using vsetivli. */
+ && (nunits <= 31 * lmul || nunits > 31 * 8)
+ && (get_vector_mode
+ (elem_mode,
+ exact_div (BYTES_PER_RISCV_VECTOR * lmul,
+ potential_ew)
+ ).exists (&vmode)))
+ break;
+ }
+
+ /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
+ wide. BYTES_PER_RISCV_VECTOR can't be evenly divided by
+ the sizes of larger element types; the LMUL factor of 8 can at
+ the moment be divided by the SEW, with SEW of up to 8 bytes,
+ but there are reserved encodings so there might be larger
+ SEW in the future. */
+ if (get_vector_mode (elem_mode,
+ exact_div (BYTES_PER_RISCV_VECTOR * 8,
+ potential_ew)).exists (&vmode))
+ break;
+
+ /* We may get here if we tried an element size that's larger than
+ the hardware supports, but we should at least find a suitable
+ byte vector mode. */
+ gcc_assert (potential_ew > 1);
+ }
+ if (potential_ew > 1)
+ length_rtx = GEN_INT (length / potential_ew);
+ }
+ else
+ {
+ vmode = E_RVVM8QImode;
+ }
+
+ /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+ arguments + 1 for the call. The RVV loop expansion takes at least 7
+ instructions, so when we're optimizing for size a libcall may be
+ preferable. */
+ if (size_p && need_loop)
+ return false;
+
+ /* length_rtx holds the (remaining) length of the required copy.
+ cnt holds the length we copy with the current load/store pair. */
+ rtx cnt = length_rtx;
+ rtx label = NULL_RTX;
+ rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+ rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
+
+ if (need_loop)
+ {
+ length_rtx = copy_to_mode_reg (Pmode, length_rtx);
+ cnt = gen_reg_rtx (Pmode);
+ label = gen_label_rtx ();
+
+ emit_label (label);
+ emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx));
+ }
+
+ vec = gen_reg_rtx (vmode);
+ src = change_address (src_in, vmode, src_addr);
+ dst = change_address (dst_in, vmode, dst_addr);
+
+ /* If we don't need a loop and have a suitable mode to describe the size,
+ just do a load / store pair and leave it up to the later lazy code
+ motion pass to insert the appropriate vsetvli. */
+ if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+ {
+ emit_move_insn (vec, src);
+ emit_move_insn (dst, vec);
+ }
+ else
+ {
+ machine_mode mask_mode = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require ();
+ rtx mask = CONSTM1_RTX (mask_mode);
+ if (!satisfies_constraint_K (cnt))
+ cnt = force_reg (Pmode, cnt);
+ rtx m_ops[] = {vec, mask, src};
+ emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA,
+ m_ops, cnt);
+ emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
+ get_avl_type_rtx (NONVLMAX)));
+ }
+
+ if (need_loop)
+ {
+ emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+ emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+ emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt)));
+
+ /* Emit the loop condition. */
+ rtx test = gen_rtx_NE (VOIDmode, length_rtx, const0_rtx);
+ emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
+ emit_insn (gen_nop ());
+ }
+
+ return true;
+}
+
/* Return the vectorization machine mode for RVV according to LMUL. */
machine_mode
preferred_simd_mode (scalar_mode mode)
@@ -2271,14 +2271,16 @@
DONE;
})
-(define_expand "cpymemsi"
+(define_expand "cpymem<mode>"
[(parallel [(set (match_operand:BLK 0 "general_operand")
(match_operand:BLK 1 "general_operand"))
- (use (match_operand:SI 2 ""))
+ (use (match_operand:P 2 ""))
(use (match_operand:SI 3 "const_int_operand"))])]
""
{
- if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
+ if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2]))
+ DONE;
+ else if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
DONE;
else
FAIL;
new file mode 100644
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#if 0 /* Using include files when using a multilib-relevant -march option is dicey */
+#include <string.h>
+#else
+extern void *memcpy(void *__restrict dest, const void *__restrict src, __SIZE_TYPE__ n);
+#endif
+
+/* memcpy should be implemented using the cpymem pattern.
+** f1:
+XX \.L\d+: # local label is ignored
+** vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+** vle8\.v\s+v\d+,0\(a1\)
+** vse8\.v\s+v\d+,0\(a0\)
+** add\s+a1,a1,[ta][0-7]
+** add\s+a0,a0,[ta][0-7]
+** sub\s+a2,a2,[ta][0-7]
+** bne\s+a2,zero,\.L\d+
+** ret
+*/
+
+void f1 (void *a, void *b, __SIZE_TYPE__ l)
+{
+ memcpy (a, b, l);
+}
+
+/* We should still use cpymem even with slightly different types, as signed
+ overflow is undefined.
+** f2:
+XX \.L\d+: # local label is ignored
+** vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+** vle8\.v\s+v\d+,0\(a1\)
+** vse8\.v\s+v\d+,0\(a0\)
+** add\s+a1,a1,[ta][0-7]
+** add\s+a0,a0,[ta][0-7]
+** sub\s+a2,a2,[ta][0-7]
+** bne\s+a2,zero,\.L\d+
+** ret
+*/
+void f2 (__INT32_TYPE__* a, __INT32_TYPE__* b, int l)
+{
+ memcpy (a, b, l);
+}
+
+/* If it is known that the pointer arguments to memcpy point
+ to an aligned object, cpymem can use that alignment.
+ Use extern here so that we get a known alignment, lest
+ DATA_ALIGNMENT force us to make the scan pattern accommodate
+ code for different alignments depending on word size.
+** f3:
+** lui\s+[ta][0-7],%hi\(a_a\)
+** lui\s+[ta][0-7],%hi\(a_b\)
+** addi\s+a4,[ta][0-7],%lo\(a_b\)
+** vsetivli\s+zero,16,e32,m4,ta,ma
+** vle32.v\s+v\d+,0\([ta][0-7]\)
+** addi\s+[ta][0-7],[ta][0-7],%lo\(a_a\)
+** vse32\.v\s+v\d+,0\([ta][0-7]\)
+** ret
+*/
+
+extern struct { __INT32_TYPE__ a[16]; } a_a, a_b;
+
+void f3 ()
+{
+ memcpy (&a_a, &a_b, sizeof a_a);
+}
+
+/* { dg-final { scan-assembler-not {\m(tail|call)\s+memcpy\M} } } */
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef struct { char c[16]; } c16;
+typedef struct { char c[32]; } c32;
+typedef struct { short s; char c[30]; } s16;
+
+/* A short struct copy can use vsetivli.
+** f1:
+** vsetivli\s+zero,16,e8,m1,ta,ma
+** vle8.v\s+v1,0\(a1\)
+** vse8.v\s+v1,0\(a0\)
+** ret
+*/
+void f1 (c16 *a, c16* b)
+{
+ *a = *b;
+}
+
+/* A longer one needs li.
+** f2:
+** li\s+[ta][0-7],32
+** vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
+** vle8.v\s+v2,0\(a1\)
+** vse8.v\s+v2,0\(a0\)
+** ret
+*/
+void f2 (c32 *a, c32* b)
+{
+ *a = *b;
+}
+
+/* A 32 byte struct is still short enough for vsetivli
+ if we can use an element width larger than 8.
+** f3:
+** vsetivli\s+zero,16,e16,m2,ta,ma
+** vle16.v\s+v2,0\(a1\)
+** vse16.v\s+v2,0\(a0\)
+** ret
+*/
+void f3 (s16 *a, s16* b)
+{
+ *a = *b;
+}