@@ -7366,7 +7366,8 @@
;; knows what to generate.
(define_expand "doloop_end"
[(use (match_operand 0 "" "")) ; loop pseudo
- (use (match_operand 1 "" ""))] ; label
+ (use (match_operand 1 "" "")) ; label
+ (use (match_operand 2 "" ""))] ; decrement constant
"optimize > 0 && flag_modulo_sched"
{
rtx s0;
@@ -4985,12 +4985,13 @@ archs4x, archs4xd"
(pc)))
(set (match_dup 0) (plus:SI (match_dup 0) (const_int -1)))
(unspec:SI [(const_int 0)] UNSPEC_ARC_LP)
- (clobber (match_dup 2))])]
+ (clobber (match_dup 3))
+ (match_operand 2 "" "")])]
""
{
if (GET_MODE (operands[0]) != SImode)
FAIL;
- operands[2] = gen_rtx_SCRATCH (SImode);
+ operands[3] = gen_rtx_SCRATCH (SImode);
})
(define_insn "arc_lp"
@@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
extern bool arm_q_bit_access (void);
extern bool arm_ge_bits_access (void);
extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx arm_attempt_dlstp_transform (rtx, rtx);
#ifdef RTX_CODE
enum reg_class
arm_mode_base_reg_class (machine_mode);
@@ -470,6 +470,9 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_SCHED_REORDER
#define TARGET_SCHED_REORDER arm_sched_reorder
+#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+#define TARGET_ALLOW_ELEMENTWISE_DOLOOP_P arm_allow_elementwise_doloop_p
+
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST arm_register_move_cost
@@ -34138,8 +34141,370 @@ arm_target_insn_ok_for_lob (rtx insn)
return single_succ_p (bb)
&& single_pred_p (bb)
- && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
- && contains_no_active_insn_p (bb);
+ && single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+ if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC)
+ {
+ switch (XINT (XEXP (x, 1), 1))
+ {
+ case VCTP8Q:
+ return 16;
+ case VCTP16Q:
+ return 8;
+ case VCTP32Q:
+ return 4;
+ case VCTP64Q:
+ return 2;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
+
+/* Check if an insn requires the use of the VPR_REG, if it does, return the
+ sub-rtx of the matched operand. If there is more than one operand (e.g. an
+ input operand and an output operand) that uses VPR_REG, return the first
+ occurrence, which is usually the output operand. */
+
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn)
+{
+ if (!NONJUMP_INSN_P (insn))
+ return NULL_RTX;
+
+ bool requires_vpr;
+ extract_constrain_insn (insn);
+ int n_operands = recog_data.n_operands;
+ if (recog_data.n_alternatives == 0)
+ return NULL_RTX;
+
+ /* Fill in recog_op_alt with information about the constraints of
+ this insn. */
+ preprocess_constraints (insn);
+
+ for (int use = 0; use < n_operands; use++)
+ {
+ requires_vpr = true;
+ /* Iterate through alternatives of operand "use" in recog_op_alt and
+ identify if the operand is required to be the VPR. */
+ for (int alt1 = 0; alt1 < recog_data.n_alternatives; alt1++)
+ {
+ const operand_alternative *op_alt1
+ = &recog_op_alt[alt1 * n_operands];
+ /* Fetch the reg_class for each entry and check it against the
+ * VPR_REG reg_class. */
+ if (alternative_class (op_alt1, use) != VPR_REG)
+ requires_vpr = false;
+ }
+ /* If all alternatives of the insn require the VPR reg for this operand,
+ it means that either this is a VPR-generating instruction, like a vctp,
+ vcmp, etc., or it is a VPT-predicated instruction. Return the subrtx
+ of the VPR reg operand. */
+ if (requires_vpr)
+ return recog_data.operand[use];
+ }
+ return NULL_RTX;
+}
+
+/* Scan the basic block of a loop body for a vctp instruction. If there is
+ exactly one unique vctp instruction, return its rtx_insn *. */
+
+static rtx_insn *
+arm_mve_get_loop_unique_vctp (basic_block bb)
+{
+ rtx_insn *insn = BB_HEAD (bb);
+ rtx_insn *vctp_op = NULL;
+
+ /* Now scan through all the instruction patterns and
+ pick out any MVE instructions. */
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (INSN_P (insn))
+ {
+ /* First check if this is a vctp instruction. There needs to be
+ exactly one vctp instruction within the loop. */
+ if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0)
+ {
+ /* If we already found one vctp instruction, then the
+ loop is not consistent internally. */
+ if (vctp_op)
+ return NULL;
+
+ vctp_op = insn;
+ }
+ }
+ }
+ return vctp_op;
+}
+
+rtx
+arm_attempt_dlstp_transform (rtx label, rtx count)
+{
+ int decrementnum;
+ basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+ rtx initial_compare;
+ /* Doloop can only be done "elementwise" with predicated dlstp/letp
+ when the iteration counter gets decremented by the number of MVE
+ lanes. This can be extracted from the `count`, which is the expression
+ used to calculate the number of iterations that the loop would execute
+ for a standard dls/le loop. Since we only support cases where this is a
+ power of 2, we can assume that this expression arrives here as:
+ (lshiftrt: (A) (const_int y))
+ Then we can extract the decrementnum from y. */
+ if (GET_CODE (count) == LSHIFTRT && ARITHMETIC_P (XEXP (count, 0))
+ /* There is one final condition that needs to be met for the loop to be
+ transformable: dlstp/letp will continue looping until there are
+ elements still to process. This can only work if the looping ends
+ when the element counter reaches zero and not some other value
+ (e.g. n > 0 works, not n > 1), or we can incorrectly end up running
+ one additional iteration. To bypass any hoisting that the compiler
+ may have done with the `A` in `count` above, we can instead look up
+ to the bb before the loop preheader: this should end with a cmp+jump
+ pair, where the cmp needs to be with (const_int 0). */
+ && loop_preheader_edge (body->loop_father)->src->prev_bb
+ && BB_END (loop_preheader_edge (body->loop_father)->src->prev_bb)
+ && PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+ ->src->prev_bb))
+ && INSN_P (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+ ->src->prev_bb)))
+ && (initial_compare
+ = PATTERN (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+ ->src->prev_bb))))
+ && GET_CODE (initial_compare) == SET
+ && cc_register (XEXP (initial_compare, 0), VOIDmode)
+ && GET_CODE (XEXP (initial_compare, 1)) == COMPARE
+ && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1))
+ && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0)
+ {
+ /* Extract the integer decrement from the LSHIFTR condition. */
+ decrementnum = (1 << (INTVAL (XEXP (count, 1))));
+ /* Find the vctp predicate generation inside the loop body BB. */
+ rtx_insn *vctp_insn = arm_mve_get_loop_unique_vctp (body);
+ /* If we have successfully found exactly one vctp predicate-generating
+ instruction within the loop and the number by which we decrement the
+ loop counter in each iteration matches the number of lanes of the
+ vctp instruction, we can attempt to turn this into a dlstp/letp loop.
+ */
+ if (!vctp_insn
+ || decrementnum != arm_mve_get_vctp_lanes (PATTERN (vctp_insn)))
+ return GEN_INT (1);
+
+ rtx_insn *insn = 0;
+ rtx_insn *cur_insn = 0;
+ rtx_insn *seq;
+ rtx vctp_vpr_generated = NULL_RTX;
+ rtx insn_vpr_reg_operand = NULL_RTX;
+ int new_icode;
+
+ /* Scan through the insns in the loop bb and emit the transformed bb
+ insns to a sequence. */
+ start_sequence ();
+ FOR_BB_INSNS (body, insn)
+ {
+ if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+ continue;
+ else if (NOTE_P (insn))
+ emit_note ((enum insn_note) NOTE_KIND (insn));
+ else if (!INSN_P (insn))
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+ /* When we find the vctp instruction: This may be followed by
+ a sign-extend insn to SImode. If it is, then save the
+ sign-extended REG into vctp_vpr_generated. If there is no
+ sign-extend, then store the raw output of the vctp.
+ For any VPT-predicated instructions we need to ensure that
+ the VPR they use is the same as the one given here and
+ they often consume the output of a subreg of the SImode
+ sign-extended VPR-reg. As a result, comparing against the
+ output of the sign-extend is more likely to succeed.
+ This code also guarantees to us that the vctp comes before
+ any instructions that use the VPR within the loop, for the
+ dlstp/letp transform to succeed. */
+ else if (insn == vctp_insn)
+ {
+ if (GET_CODE (
+ XEXP (PATTERN (next_nonnote_nondebug_insn_bb (insn)), 1))
+ == SIGN_EXTEND
+ && GET_CODE (
+ XEXP (PATTERN (next_nonnote_nondebug_insn_bb (
+ next_nonnote_nondebug_insn_bb (insn))),
+ 1))
+ == SUBREG)
+ vctp_vpr_generated
+ = XEXP (PATTERN (next_nonnote_nondebug_insn_bb (
+ next_nonnote_nondebug_insn_bb (insn))),
+ 0);
+ else
+ vctp_vpr_generated = XEXP (PATTERN (insn), 0);
+ /* Also emit a USE of the source register of the vctp.
+ This holds the number of elements being processed
+ by the loop. This later gets stored into `count`.
+ */
+ emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0));
+ continue;
+ }
+ /* If the insn pattern requires the use of the VPR, then it
+ is a VPT-predicated instruction. */
+ else if ((insn_vpr_reg_operand = arm_get_required_vpr_reg (insn))
+ != NULL_RTX)
+ {
+ /* If the VPR value is different to the one generated by
+ the vctp, then fail the conversion. */
+ if (!rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand))
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+ /* If the insn does use the same VPR as the one generated
+ by the vctp, it will need to be transformed into its
+ non-predicated version. Also ensure that it's a valid
+ recog-ed instruction with the mve_unpredicated_insn
+ attribute. */
+ else if (recog_memoized (insn) >= 0
+ && (new_icode = get_attr_mve_unpredicated_insn (insn)))
+ {
+ extract_insn (insn);
+ rtx arr[8];
+ int j = 0;
+
+ /* When transforming a VPT-predicated instruction
+ into its unpredicated equivalent we need to drop
+ the VPR operand and we may need to also drop a
+ merge "vuninit" input operand, depending on the
+ instruction pattern. Here ensure that we have at
+ most a two-operand difference between the two
+ instructions. */
+ int n_operands_diff = recog_data.n_operands
+ - insn_data[new_icode].n_operands;
+ gcc_assert (n_operands_diff > 0 && n_operands_diff <= 2);
+
+ /* Then, loop through the operands of the predicated
+ instruction, and retain the ones that map to the
+ unpredicated instruction. */
+ for (int i = 0; i < recog_data.n_operands; i++)
+ {
+ /* Ignore the VPR and, if needed, the vuninit
+ operand. */
+ if (insn_vpr_reg_operand == recog_data.operand[i]
+ || (n_operands_diff == 2
+ && !strcmp (recog_data.constraints[i], "0")))
+ continue;
+ else
+ {
+ arr[j] = recog_data.operand[i];
+ j++;
+ }
+ }
+
+ /* Finally, emit the unpredicated instruction. */
+ switch (j)
+ {
+ case 1:
+ emit_insn (GEN_FCN (new_icode) (arr[0]));
+ break;
+ case 2:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+ break;
+ case 3:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1],
+ arr[2]));
+ break;
+ case 4:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3]));
+ break;
+ case 5:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4]));
+ break;
+ case 6:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4],
+ arr[5]));
+ break;
+ case 7:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4], arr[5],
+ arr[6]));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ /* If we can't identify the INSN as either being
+ for deletion or to re-map, then we don't know how to
+ handle it, so fail the whole conversion. */
+ else
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+ }
+ /* Instructions that don't require the VPR can be carried
+ over as-is. */
+ else if (DEBUG_INSN_P (insn))
+ emit_debug_insn (PATTERN (insn));
+ else
+ emit_insn (PATTERN (insn));
+ }
+ seq = get_insns ();
+ end_sequence ();
+
+ /* Re-write the entire BB contents with the transformed
+ sequence. */
+ FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+ if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)))
+ delete_insn (insn);
+ for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn))
+ if (NOTE_P (insn))
+ emit_note_after ((enum insn_note) NOTE_KIND (insn), BB_END (body));
+ else if (DEBUG_INSN_P (insn))
+ emit_debug_insn_after (PATTERN (insn), BB_END (body));
+ else
+ emit_insn_after (PATTERN (insn), BB_END (body));
+
+ emit_jump_insn_after (PATTERN (insn), BB_END (body));
+ return GEN_INT (decrementnum);
+ }
+ /* Bail out: we can't use dlstp/letp, so return 1 to allow loop-doloop to try
+ the standard dls/le pair. */
+ return GEN_INT (1);
+}
+
+/* Target hook to the number of elements to be processed by a dlstp/letp loop
+ into `count` to initialise the counter register. The number of elements was
+ previously extracted from the vctp insn and placed into a USE rtx.
+ We only check that the doloop_end pattern successfully decrements by a
+ number other than -1 for a valid dlstp/letp loop. No other checking is
+ needed as that was done previously. */
+
+rtx
+arm_allow_elementwise_doloop_p (rtx count, rtx label, rtx doloop)
+{
+ if (doloop
+ && INTVAL (XEXP (SET_SRC (XVECEXP (PATTERN (doloop), 0, 1)), 1)) != -1
+ && ARITHMETIC_P (XEXP (count, 0)))
+ {
+ basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+ rtx_insn* insn;
+ FOR_BB_INSNS (body, insn)
+ {
+ if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == USE)
+ {
+ rtx num_elem_reg = copy_rtx (XEXP (PATTERN (insn), 0));
+ delete_insn (insn);
+ return num_elem_reg;
+ }
+ }
+ }
+ return count;
}
#if CHECKING_P
@@ -1464,7 +1464,9 @@
(define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32")
(VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16")
- (VCTP32Q_M "32") (VCTP64Q_M "64")])
+ (VCTP32Q_M "32") (VCTP64Q_M "64")
+ (DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
+ (DLSTP64 "64")])
;; Both kinds of return insn.
(define_code_iterator RETURNS [return simple_return])
@@ -1773,6 +1775,8 @@
(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
+ DLSTP64])
;; Define iterators for VCMLA operations
(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
@@ -10837,3 +10837,38 @@
}
DONE;
})
+
+;; Originally expanded by 'predicated_doloop_end'.
+(define_insn "*predicated_doloop_end_internal"
+ [(set (pc)
+ (if_then_else
+ (ge (plus:SI (reg:SI LR_REGNUM)
+ (match_operand:SI 0 "const_int_operand" ""))
+ (const_int 0))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (match_dup 0)))
+ (clobber (reg:CC CC_REGNUM))]
+ "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+ {
+ if (get_attr_length (insn) == 4)
+ return "letp\t%|lr, %l1";
+ else
+ return "subs\t%|lr, #%0;bgt\t%l1";
+ }
+ [(set (attr "length")
+ (if_then_else
+ (ltu (minus (pc) (match_dup 1)) (const_int 1024))
+ (const_int 4)
+ (const_int 6)))
+ (set_attr "type" "branch")])
+
+(define_insn "dlstp<mode1>_insn"
+ [
+ (set (reg:SI LR_REGNUM)
+ (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")]
+ DLSTP))
+ ]
+ "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+ "dlstp.<mode1>\t%|lr, %0")
\ No newline at end of file
@@ -1610,10 +1610,11 @@
;; knows what to generate.
(define_expand "doloop_end"
[(use (match_operand 0 "" "")) ; loop pseudo
- (use (match_operand 1 "" ""))] ; label
+ (use (match_operand 1 "" "")) ; label
+ (use (match_operand 2 "" ""))] ; decrement constant
"TARGET_32BIT"
"
- {
+{
/* Currently SMS relies on the do-loop pattern to recognize loops
where (1) the control part consists of all insns defining and/or
using a certain 'count' register and (2) the loop count can be
@@ -1623,41 +1624,68 @@
Also used to implement the low over head loops feature, which is part of
the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
- if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
- {
- rtx s0;
- rtx bcomp;
- rtx loc_ref;
- rtx cc_reg;
- rtx insn;
- rtx cmp;
-
- if (GET_MODE (operands[0]) != SImode)
- FAIL;
-
- s0 = operands [0];
-
- /* Low over head loop instructions require the first operand to be LR. */
- if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1]))
- s0 = gen_rtx_REG (SImode, LR_REGNUM);
-
- if (TARGET_THUMB2)
- insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
- else
- insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-
- cmp = XVECEXP (PATTERN (insn), 0, 0);
- cc_reg = SET_DEST (cmp);
- bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
- loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]);
- emit_jump_insn (gen_rtx_SET (pc_rtx,
- gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
- loc_ref, pc_rtx)));
- DONE;
- }
- else
- FAIL;
- }")
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
+ {
+ rtx s0;
+ rtx bcomp;
+ rtx loc_ref;
+ rtx cc_reg;
+ rtx insn;
+ rtx cmp;
+ rtx decrement_num;
+
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+
+ s0 = operands[0];
+
+ if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands[1]))
+ {
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
+ /* If we have a compatible MVE target, try and analyse the loop
+ contents to determine if we can use predicated dlstp/letp
+ looping. */
+ if (TARGET_HAVE_MVE && TARGET_THUMB2
+ && (decrement_num = arm_attempt_dlstp_transform (operands[1],
+ operands[2]))
+ && (INTVAL (decrement_num) != 1))
+ {
+ insn = emit_insn
+ (gen_thumb2_addsi3_compare0
+ (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num)))));
+ cmp = XVECEXP (PATTERN (insn), 0, 0);
+ cc_reg = SET_DEST (cmp);
+ bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ emit_jump_insn (gen_rtx_SET (pc_rtx,
+ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+ loc_ref, pc_rtx)));
+ DONE;
+ }
+
+ /* Otherwise, try standard decrement-by-one dls/le looping. */
+ if (TARGET_THUMB2)
+ insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0,
+ GEN_INT (-1)));
+ else
+ insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+
+ cmp = XVECEXP (PATTERN (insn), 0, 0);
+ cc_reg = SET_DEST (cmp);
+ bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ emit_jump_insn (gen_rtx_SET (pc_rtx,
+ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+ loc_ref, pc_rtx)));
+ DONE;
+ }
+ else
+ FAIL;
+ }
+ else
+ FAIL;
+}")
(define_insn "*clear_apsr"
[(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR)
@@ -1755,7 +1783,37 @@
{
if (REGNO (operands[0]) == LR_REGNUM)
{
- emit_insn (gen_dls_insn (operands[0]));
+ /* Pick out the number by which we are decrementing the loop counter
+ in every iteration. If it's > 1, then use dlstp. */
+ int const_int_dec_num
+ = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1),
+ 1),
+ 1)));
+ switch (const_int_dec_num)
+ {
+ case 16:
+ emit_insn (gen_dlstp8_insn (operands[0]));
+ break;
+
+ case 8:
+ emit_insn (gen_dlstp16_insn (operands[0]));
+ break;
+
+ case 4:
+ emit_insn (gen_dlstp32_insn (operands[0]));
+ break;
+
+ case 2:
+ emit_insn (gen_dlstp64_insn (operands[0]));
+ break;
+
+ case 1:
+ emit_insn (gen_dls_insn (operands[0]));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
DONE;
}
else
@@ -579,6 +579,10 @@
VCTP16Q
VCTP32Q
VCTP64Q
+ DLSTP8
+ DLSTP16
+ DLSTP32
+ DLSTP64
VPNOT
VCREATEQ_F
VCVTQ_N_TO_F_S
@@ -1959,7 +1959,8 @@
(plus:SI (match_dup 0)
(const_int -1)))
(unspec [(const_int 0)] UNSPEC_LSETUP_END)
- (clobber (match_dup 2))
+ (clobber (match_dup 3))
+ (match_operand 2 "" "")
(clobber (reg:BI REG_CC))])] ; match_scratch
""
{
@@ -1967,7 +1968,7 @@
if (GET_MODE (operands[0]) != SImode)
FAIL;
bfin_hardware_loop ();
- operands[2] = gen_rtx_SCRATCH (SImode);
+ operands[3] = gen_rtx_SCRATCH (SImode);
})
(define_insn "loop_end"
@@ -1429,13 +1429,14 @@
(set (match_dup 0)
(plus:SI (match_dup 0)
(const_int -1)))
- (clobber (match_dup 2))])] ; match_scratch
+ (clobber (match_dup 3)) ; match_scratch
+ (match_operand 2 "" "")])]
"TARGET_INSNS_64PLUS && optimize"
{
/* The loop optimizer doesn't check the predicates... */
if (GET_MODE (operands[0]) != SImode)
FAIL;
- operands[2] = gen_rtx_SCRATCH (SImode);
+ operands[3] = gen_rtx_SCRATCH (SImode);
})
(define_insn "mvilc"
@@ -3956,7 +3956,8 @@
(define_expand "doloop_end"
[(use (match_operand 0 "" "")) ; loop pseudo
- (use (match_operand 1 "" ""))] ; label
+ (use (match_operand 1 "" "")) ; label
+ (use (match_operand 2 "" ""))] ; decrement constant
""
{
if (GET_MODE (operands[0]) != DImode)
@@ -332,7 +332,8 @@
(pc)))
(set (match_dup 0)
(plus:HI (match_dup 0)
- (const_int -1)))])]
+ (const_int -1)))
+ (match_operand 2 "" "")])]
"TARGET_40_PLUS"
"{
if (GET_MODE (operands[0]) != HImode)
@@ -1636,7 +1636,8 @@
(define_expand "doloop_end"
[(use (match_operand 0 "nonimmediate_operand"))
- (use (label_ref (match_operand 1 "")))]
+ (use (label_ref (match_operand 1 "")))
+ (use (match_operand 2 "" ""))] ; decrement constant
"TARGET_OPT_LOOP"
{
if (GET_CODE (operands[0]) == REG && GET_MODE (operands[0]) == QImode)
@@ -13422,7 +13422,8 @@
(define_expand "doloop_end"
[(use (match_operand 0)) ; loop pseudo
- (use (match_operand 1))] ; label
+ (use (match_operand 1)) ; label
+ (use (match_operand 2 "" ""))] ; decrement constant
""
{
if (GET_MODE (operands[0]) != Pmode)
@@ -9780,7 +9780,8 @@
(define_expand "doloop_end"
[(use (match_operand 0 "" "")) ; loop pseudo
- (use (match_operand 1 "" ""))] ; label
+ (use (match_operand 1 "" "")) ; label
+ (use (match_operand 2 "" ""))] ; decrement constant
""
{
if (GET_MODE (operands[0]) == SImode)
@@ -6404,7 +6404,8 @@
(pc)))
(set (match_dup 0)
(plus:SI (match_dup 0) (const_int -1)))
- (clobber (reg:SI T_REG))])]
+ (clobber (reg:SI T_REG))
+ (match_operand 2 "" "")])]
"TARGET_SH2"
{
if (GET_MODE (operands[0]) != SImode)
@@ -1434,7 +1434,8 @@
(define_expand "doloop_end"
[(use (match_operand 0 "" "")) ; loop pseudo
- (use (match_operand 1 "" ""))] ; label
+ (use (match_operand 1 "" "")) ; label
+ (use (match_operand 2 "" ""))] ; decrement constant
"TARGET_V850E3V5_UP && TARGET_LOOP"
{
rtx loop_cnt = operands[0];
@@ -2016,13 +2016,14 @@
(plus:SI (match_dup 0)
(const_int -1)))
(unspec [(const_int 0)] UNSPEC_LSETUP_END)
- (clobber (match_dup 2))])] ; match_scratch
+ (clobber (match_dup 3)) ; match_scratch
+ (match_operand 2 "" "")])]
"TARGET_LOOPS && optimize"
{
/* The loop optimizer doesn't check the predicates... */
if (GET_MODE (operands[0]) != SImode)
FAIL;
- operands[2] = gen_rtx_SCRATCH (SImode);
+ operands[3] = gen_rtx_SCRATCH (SImode);
})
@@ -11774,6 +11774,14 @@ loops, and will help ivopts to make some decisions.
The default version of this hook returns false.
@end deftypefn
+@deftypefn {Target Hook} rtx TARGET_ALLOW_ELEMENTWISE_DOLOOP_P (rtx @var{count}, rtx @var{label}, rtx @var{doloop})
+This target hook allows the target to support loop-doloop optimisations
+where the value that gets put into the loop counter register is not a
+pre-calculation of the number of iteration of the loop. For instance,
+the value used can be the number of elements that the loop will process.
+The default version of this hook returns the same rtx it was given.
+@end deftypefn
+
@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
Return true if the target supports hardware count register for decrement
and branch.
@@ -7730,6 +7730,8 @@ to by @var{ce_info}.
@hook TARGET_PREDICT_DOLOOP_P
+@hook TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+
@hook TARGET_HAVE_COUNT_REG_DECR_P
@hook TARGET_DOLOOP_COST_FOR_GENERIC
@@ -85,29 +85,29 @@ doloop_condition_get (rtx_insn *doloop_pat)
forms:
1) (parallel [(set (pc) (if_then_else (condition)
- (label_ref (label))
- (pc)))
- (set (reg) (plus (reg) (const_int -1)))
- (additional clobbers and uses)])
+ (label_ref (label))
+ (pc)))
+ (set (reg) (plus (reg) (const_int -n)))
+ (additional clobbers and uses)])
The branch must be the first entry of the parallel (also required
by jump.cc), and the second entry of the parallel must be a set of
the loop counter register. Some targets (IA-64) wrap the set of
the loop counter in an if_then_else too.
- 2) (set (reg) (plus (reg) (const_int -1))
- (set (pc) (if_then_else (reg != 0)
- (label_ref (label))
- (pc))).
+ 2) (set (reg) (plus (reg) (const_int -n))
+ (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+ (pc))).
Some targets (ARM) do the comparison before the branch, as in the
following form:
- 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
- (set (reg) (plus (reg) (const_int -1)))])
- (set (pc) (if_then_else (cc == NE)
- (label_ref (label))
- (pc))) */
+ 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -n), 0)))
+ (set (reg) (plus (reg) (const_int -n)))])
+ (set (pc) (if_then_else (cc == NE)
+ (label_ref (label))
+ (pc))) */
pattern = PATTERN (doloop_pat);
@@ -143,7 +143,7 @@ doloop_condition_get (rtx_insn *doloop_pat)
|| GET_CODE (cmp_arg1) != PLUS)
return 0;
reg_orig = XEXP (cmp_arg1, 0);
- if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
+ if (!CONST_INT_P (XEXP (cmp_arg1, 1))
|| !REG_P (reg_orig))
return 0;
cc_reg = SET_DEST (cmp_orig);
@@ -156,7 +156,8 @@ doloop_condition_get (rtx_insn *doloop_pat)
{
/* We expect the condition to be of the form (reg != 0) */
cond = XEXP (SET_SRC (cmp), 0);
- if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+ if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE)
+ || XEXP (cond, 1) != const0_rtx)
return 0;
}
}
@@ -173,14 +174,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
if (! REG_P (reg))
return 0;
- /* Check if something = (plus (reg) (const_int -1)).
+ /* Check if something = (plus (reg) (const_int -n)).
On IA-64, this decrement is wrapped in an if_then_else. */
inc_src = SET_SRC (inc);
if (GET_CODE (inc_src) == IF_THEN_ELSE)
inc_src = XEXP (inc_src, 1);
if (GET_CODE (inc_src) != PLUS
|| XEXP (inc_src, 0) != reg
- || XEXP (inc_src, 1) != constm1_rtx)
+ || !CONST_INT_P (XEXP (inc_src, 1)))
return 0;
/* Check for (set (pc) (if_then_else (condition)
@@ -211,42 +212,49 @@ doloop_condition_get (rtx_insn *doloop_pat)
|| (GET_CODE (XEXP (condition, 0)) == PLUS
&& XEXP (XEXP (condition, 0), 0) == reg))
{
- if (GET_CODE (pattern) != PARALLEL)
/* For the second form we expect:
- (set (reg) (plus (reg) (const_int -1))
- (set (pc) (if_then_else (reg != 0)
- (label_ref (label))
- (pc))).
+ (set (reg) (plus (reg) (const_int -n))
+ (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+ (pc))).
- is equivalent to the following:
+ If n == 1, that is equivalent to the following:
- (parallel [(set (pc) (if_then_else (reg != 1)
- (label_ref (label))
- (pc)))
- (set (reg) (plus (reg) (const_int -1)))
- (additional clobbers and uses)])
+ (parallel [(set (pc) (if_then_else (reg != 1)
+ (label_ref (label))
+ (pc)))
+ (set (reg) (plus (reg) (const_int -1)))
+ (additional clobbers and uses)])
- For the third form we expect:
+ For the third form we expect:
- (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
- (set (reg) (plus (reg) (const_int -1)))])
- (set (pc) (if_then_else (cc == NE)
- (label_ref (label))
- (pc)))
+ (parallel [(set (cc) (compare ((plus (reg) (const_int -n)), 0))
+ (set (reg) (plus (reg) (const_int -n)))])
+ (set (pc) (if_then_else (cc == NE)
+ (label_ref (label))
+ (pc)))
- which is equivalent to the following:
+ Which also for n == 1 is equivalent to the following:
- (parallel [(set (cc) (compare (reg, 1))
- (set (reg) (plus (reg) (const_int -1)))
- (set (pc) (if_then_else (NE == cc)
- (label_ref (label))
- (pc))))])
+ (parallel [(set (cc) (compare (reg, 1))
+ (set (reg) (plus (reg) (const_int -1)))
+ (set (pc) (if_then_else (NE == cc)
+ (label_ref (label))
+ (pc))))])
- So we return the second form instead for the two cases.
+ So we return the second form instead for the two cases.
+ For the "elementwise" form where the decrement number isn't -1,
+ the final value may be exceeded, so use GE instead of NE.
*/
- condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+ if (GET_CODE (pattern) != PARALLEL)
+ {
+ if (INTVAL (XEXP (inc_src, 1)) != -1)
+ condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx);
+ else
+ condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+ }
return condition;
}
@@ -685,17 +693,6 @@ doloop_optimize (class loop *loop)
return false;
}
- max_cost
- = COSTS_N_INSNS (param_max_iterations_computation_cost);
- if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
- > max_cost)
- {
- if (dump_file)
- fprintf (dump_file,
- "Doloop: number of iterations too costly to compute.\n");
- return false;
- }
-
if (desc->const_iter)
iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
UNSIGNED);
@@ -720,7 +717,25 @@ doloop_optimize (class loop *loop)
count = copy_rtx (desc->niter_expr);
start_label = block_label (desc->in_edge->dest);
doloop_reg = gen_reg_rtx (mode);
- rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
+ rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label,
+ count);
+
+ /* Not all targets need to pre-calculate the number of the iterations of
+ the loop, they instead work by storing the number of elements in the
+ counter_reg and decrementing that. Call the appropriate target hook to
+ change the value of count. */
+ count = targetm.allow_elementwise_doloop_p (count, start_label, doloop_seq);
+
+ max_cost
+ = COSTS_N_INSNS (param_max_iterations_computation_cost);
+ if (set_src_cost (count, mode, optimize_loop_for_speed_p (loop))
+ > max_cost)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "Doloop: number of iterations too costly to compute.\n");
+ return false;
+ }
word_mode_size = GET_MODE_PRECISION (word_mode);
word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
@@ -737,7 +752,7 @@ doloop_optimize (class loop *loop)
else
count = lowpart_subreg (word_mode, count, mode);
PUT_MODE (doloop_reg, word_mode);
- doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
+ doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label, count);
}
if (! doloop_seq)
{
@@ -48,7 +48,7 @@ DEF_TARGET_INSN (casesi, (rtx x0, rtx x1, rtx x2, rtx x3, rtx x4))
DEF_TARGET_INSN (check_stack, (rtx x0))
DEF_TARGET_INSN (clear_cache, (rtx x0, rtx x1))
DEF_TARGET_INSN (doloop_begin, (rtx x0, rtx x1))
-DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1))
+DEF_TARGET_INSN (doloop_end, (rtx x0, rtx x1, rtx x2))
DEF_TARGET_INSN (eh_return, (rtx x0))
DEF_TARGET_INSN (epilogue, (void))
DEF_TARGET_INSN (exception_receiver, (void))
@@ -4392,6 +4392,16 @@ The default version of this hook returns false.",
bool, (class loop *loop),
default_predict_doloop_p)
+DEFHOOK
+(allow_elementwise_doloop_p,
+ "This target hook allows the target to support loop-doloop optimisations\n\
+where the value that gets put into the loop counter register is not a\n\
+pre-calculation of the number of iteration of the loop. For instance,\n\
+the value used can be the number of elements that the loop will process.\n\
+The default version of this hook returns the same rtx it was given.",
+ rtx, (rtx count, rtx label, rtx doloop),
+ default_allow_elementwise_doloop_p)
+
DEFHOOKPOD
(have_count_reg_decr_p,
"Return true if the target supports hardware count register for decrement\n\
@@ -661,6 +661,12 @@ default_predict_doloop_p (class loop *loop ATTRIBUTE_UNUSED)
return false;
}
+rtx
+default_allow_elementwise_doloop_p (rtx count, rtx, rtx)
+{
+ return count;
+}
+
/* By default, just use the input MODE itself. */
machine_mode
@@ -88,6 +88,7 @@ extern bool default_fixed_point_supported_p (void);
extern bool default_has_ifunc_p (void);
extern bool default_predict_doloop_p (class loop *);
+extern rtx default_allow_elementwise_doloop_p (rtx, rtx, rtx);
extern machine_mode default_preferred_doloop_mode (machine_mode);
extern const char * default_invalid_within_doloop (const rtx_insn *);
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ int16x8_t va = vldrhq_z_s16 (a, p);
+ int16x8_t vb = vldrhq_z_s16 (b, p);
+ int16x8_t vc = vaddq_x_s16 (va, vb, p);
+ vstrhq_p_s16 (c, vc, p);
+ c+=8;
+ a+=8;
+ b+=8;
+ n-=8;
+ }
+}
+
+int main ()
+{
+ int i;
+ int16_t temp1[N];
+ int16_t temp2[N];
+ int16_t temp3[N];
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus16 (temp1, temp2, temp3, 0);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus16 (temp1, temp2, temp3, 1);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 7);
+ check_plus16 (temp1, temp2, temp3, 7);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus16 (temp1, temp2, temp3, 8);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus16 (temp1, temp2, temp3, 9);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus16 (temp1, temp2, temp3, 16);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus16 (temp1, temp2, temp3, 17);
+
+ reset_data16 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.16\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+int main ()
+{
+ int i;
+ int32_t temp1[N];
+ int32_t temp2[N];
+ int32_t temp3[N];
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus32 (temp1, temp2, temp3, 0);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus32 (temp1, temp2, temp3, 1);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 3);
+ check_plus32 (temp1, temp2, temp3, 3);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 4);
+ check_plus32 (temp1, temp2, temp3, 4);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 5);
+ check_plus32 (temp1, temp2, temp3, 5);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus32 (temp1, temp2, temp3, 8);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus32 (temp1, temp2, temp3, 9);
+
+ reset_data32 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.32\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp64q (n);
+    int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (0, 8), p);
+    vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (0, 8), va, p);
+ c+=2;
+ a+=2;
+ n-=2;
+ }
+}
+
+int main ()
+{
+ int i;
+ int64_t temp1[N];
+ int64_t temp3[N];
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 0);
+ check_memcpy64 (temp1, temp3, 0);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 1);
+ check_memcpy64 (temp1, temp3, 1);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 2);
+ check_memcpy64 (temp1, temp3, 2);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 3);
+ check_memcpy64 (temp1, temp3, 3);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 4);
+ check_memcpy64 (temp1, temp3, 4);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 5);
+ check_memcpy64 (temp1, temp3, 5);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 6);
+ check_memcpy64 (temp1, temp3, 6);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 7);
+ check_memcpy64 (temp1, temp3, 7);
+
+ reset_data64 (temp1, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.64\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ int8x16_t vb = vldrbq_z_s8 (b, p);
+ int8x16_t vc = vaddq_x_s8 (va, vb, p);
+ vstrbq_p_s8 (c, vc, p);
+ c+=16;
+ a+=16;
+ b+=16;
+ n-=16;
+ }
+}
+
+int main ()
+{
+ int i;
+ int8_t temp1[N];
+ int8_t temp2[N];
+ int8_t temp3[N];
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus8 (temp1, temp2, temp3, 0);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus8 (temp1, temp2, temp3, 1);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 15);
+ check_plus8 (temp1, temp2, temp3, 15);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus8 (temp1, temp2, temp3, 16);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus8 (temp1, temp2, temp3, 17);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 32);
+ check_plus8 (temp1, temp2, temp3, 32);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 33);
+ check_plus8 (temp1, temp2, temp3, 33);
+
+ reset_data8 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.8\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp\t} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp\t" } } */
+/* { dg-final { scan-assembler-not "\tvpst\t" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
@@ -1,15 +1,131 @@
#include <string.h>
-
+#include <stdint.h>
/* Common code for lob tests. */
#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
-#define N 10000
+#define N 100
+
+static void
+reset_data (int *a, int *b, int *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data64 (int64_t *a, int64_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+check_plus (int *a, int *b, int *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
static void
-reset_data (int *a, int *b, int *c)
+check_memcpy64 (int64_t *a, int64_t *c, int x)
{
- memset (a, -1, N * sizeof (*a));
- memset (b, -1, N * sizeof (*b));
- memset (c, -1, N * sizeof (*c));
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != a[i]) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
}
@@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c)
} while (i < N);
}
-void
-check (int *a, int *b, int *c)
-{
- for (int i = 0; i < N; i++)
- {
- NO_LOB;
- if (c[i] != a[i] + b[i])
- abort ();
- }
-}
-
int
main (void)
{
- reset_data (a, b, c);
+ reset_data (a, b, c, N);
loop1 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop2 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop3 (a, b ,c);
- check (a, b ,c);
+ check_plus (a, b, c, N);
return 0;
}
@@ -79,14 +79,14 @@ check (void)
int
main (void)
{
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop1 (a1, b1, c1);
ref1 (a2, b2, c2);
check ();
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop2 (a1, b1, c1);
ref2 (a2, b2, c2);
check ();