@@ -65,7 +65,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
extern bool arm_q_bit_access (void);
extern bool arm_ge_bits_access (void);
extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx arm_attempt_dlstp_transform (rtx);
#ifdef RTX_CODE
enum reg_class
arm_mode_base_reg_class (machine_mode);
@@ -472,6 +472,9 @@ static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_SCHED_REORDER
#define TARGET_SCHED_REORDER arm_sched_reorder
+#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+#define TARGET_ALLOW_ELEMENTWISE_DOLOOP_P arm_allow_elementwise_doloop_p
+
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST arm_register_move_cost
@@ -33067,7 +33070,7 @@ arm_block_set_vect (rtx dstbase,
return arm_block_set_unaligned_vect (dstbase, length, value, align);
}
-/* Expand string store operation. Firstly we try to do that by using
+/* Expand string store operation. First we try to do that by using
vectorization instructions, then try with ARM unaligned access and
double-word store if profitable. OPERANDS[0] is the destination,
OPERANDS[1] is the number of bytes, operands[2] is the value to
@@ -34395,8 +34398,563 @@ arm_target_insn_ok_for_lob (rtx insn)
return single_succ_p (bb)
&& single_pred_p (bb)
- && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
- && contains_no_active_insn_p (bb);
+ && single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
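+/* If X is a SET whose source is a VCTP unspec, return the number of lanes
+   implied by the mode of its predicate output; return 0 otherwise.  As a
+   rough illustration of the shape being matched (operand details are
+   hypothetical):
+     (set (reg:V8BI vpr) (unspec:V8BI [(reg:SI n)] VCTP))
+   would return 8.  */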
+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+ if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
+ && XINT (XEXP (x, 1), 1) == VCTP)
+ {
+ switch (GET_MODE (XEXP (x, 1)))
+ {
+	case E_V16BImode:
+	  return 16;
+	case E_V8BImode:
+	  return 8;
+	case E_V4BImode:
+	  return 4;
+	case E_V2QImode:
+	  return 2;
+ default:
+ break;
+ }
+ }
+ return 0;
+}
+
+/* Check if an insn requires the use of the VPR_REG and, if it does, return
+   the sub-rtx of the VPR_REG.  The `type` argument controls which operands
+   this function checks:
+   * For type == 0, check all operands, including the OUT operands,
+     and return the first occurrence of the VPR_REG.
+   * For type == 1, only check the input operands.
+   * For type == 2, only check the output operands.
+   (INOUT operands are considered both as input and output operands.)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0)
+{
+ gcc_assert (type < 3);
+ if (!NONJUMP_INSN_P (insn))
+ return NULL_RTX;
+
+ bool requires_vpr;
+ extract_constrain_insn (insn);
+ int n_operands = recog_data.n_operands;
+ if (recog_data.n_alternatives == 0)
+ return NULL_RTX;
+
+ /* Fill in recog_op_alt with information about the constraints of
+ this insn. */
+ preprocess_constraints (insn);
+
+ for (int op = 0; op < n_operands; op++)
+ {
+ requires_vpr = true;
+ if (type == 1 && (recog_data.operand_type[op] == OP_OUT
+ || recog_data.operand_type[op] == OP_INOUT))
+ continue;
+ else if (type == 2 && (recog_data.operand_type[op] == OP_IN
+ || recog_data.operand_type[op] == OP_INOUT))
+ continue;
+
+ /* Iterate through alternatives of operand "op" in recog_op_alt and
+ identify if the operand is required to be the VPR. */
+ for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+ {
+ const operand_alternative *op_alt
+ = &recog_op_alt[alt * n_operands];
+	  /* Fetch the reg_class for each entry and check it against the
+	     VPR_REG reg_class.  */
+ if (alternative_class (op_alt, op) != VPR_REG)
+ requires_vpr = false;
+ }
+      /* If all alternatives of the insn require the VPR reg for this operand,
+	 it means that this is either a VPR-generating instruction, like a
+	 vctp or vcmp, or a VPT-predicated instruction.  Return the subrtx
+	 of the VPR reg operand.  */
+ if (requires_vpr)
+ return recog_data.operand[op];
+ }
+ return NULL_RTX;
+}
+
+static rtx
+ALWAYS_INLINE ATTRIBUTE_UNUSED
+arm_get_required_vpr_reg_ret_val (rtx_insn *insn)
+{
+ return arm_get_required_vpr_reg (insn, 2);
+}
+
+static rtx
+ALWAYS_INLINE
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+ return arm_get_required_vpr_reg (insn, 1);
+}
+
+/* Scan the basic block of a loop body for a vctp instruction.  If there is
+   at least one vctp instruction, return the first one found.  */
+
+static rtx_insn *
+arm_mve_get_loop_vctp (basic_block bb)
+{
+ rtx_insn *insn = BB_HEAD (bb);
+
+  /* Scan through all the insn patterns in the block and
+     return the first vctp instruction found.  */
+ FOR_BB_INSNS (bb, insn)
+ if (INSN_P (insn))
+ if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0)
+ return insn;
+ return NULL;
+}
+
+/* Recursively scan backwards through the DF chain within the basic block and
+   determine if any of the USEs of the original insn (or the USEs of the insns
+   where they were DEF-ed, etc., recursively) were affected by implicit VPT
+   predication of an MVE_VPT_UNPREDICATED_INSN_P in a dlstp/letp loop.  */
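+/* As a hypothetical illustration: if an unpredicated vadd computes a value
+   inside the loop and that value then feeds a vctp-predicated vstr, this
+   function returns "true" for the vstr, because the vadd's behaviour will
+   change once dlstp/letp adds implicit predication to it.  */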
+
+static bool
+arm_mve_check_df_for_implic_predic (rtx_insn *insn, rtx_insn* loop_vctp,
+ rtx vctp_vpr_generated)
+{
+ rtx insn_vpr_reg_operand = NULL_RTX;
+
+  /* Exit the recursion with "true" when we find an unpredicated insn, or
+     with "false" if we find the dlstp/letp loop vctp or a correctly
+     dlstp/letp predicated insn.  */
+ if (MVE_VPT_UNPREDICATED_INSN_P (insn) && insn != loop_vctp)
+ return true;
+ else if (insn == loop_vctp
+ || ((insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn))
+ && rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand)))
+ return false;
+
+  /* For each USE in the instruction we are called for, we will loop backwards
+     up the BB to try and find if it was DEF-ed within the dlstp/letp loop.
+     If we do find such a DEF, then recurse on its INSN.  The intention here
+     is to find if any of the inputs to the current instruction were affected
+     by implicit predication by being MVE_VPT_UNPREDICATED_INSN_Ps in a
+     dlstp/letp loop.  */
+ df_ref insn_uses = NULL;
+ FOR_EACH_INSN_USE (insn_uses, insn)
+ {
+ /* Starting from the current insn, scan backwards through the insn chain
+ until BB_HEAD: "for each insn in the BB prior to the current one". */
+ rtx_insn *prev_insn = NULL;
+ for (prev_insn = insn;
+ prev_insn && prev_insn != PREV_INSN (BB_HEAD (BLOCK_FOR_INSN (insn)));
+ prev_insn = PREV_INSN (prev_insn))
+ {
+	    /* Look at all the DEFs of that previous insn: if one of them is on
+	       the same REG as our current insn, then recurse in order to check
+	       that insn's USEs.  If any of these insns return true as
+	       MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is affected
+	       by the change in behaviour from being placed in a dlstp/letp
+	       loop.  */
+ df_ref prev_insn_defs = NULL;
+ FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
+ {
+ if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
+ && insn != prev_insn
+ && BLOCK_FOR_INSN (insn) == BLOCK_FOR_INSN (prev_insn)
+ && arm_mve_check_df_for_implic_predic (prev_insn,
+ loop_vctp,
+ vctp_vpr_generated))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/* Attempt to transform the loop contents of the loop basic block from
+   VPT-predicated insns into unpredicated insns for a dlstp/letp loop.  */
+
+rtx
+arm_attempt_dlstp_transform (rtx label)
+{
+ int decrementnum;
+ basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+
+ /* Ensure that the bb is within a loop that has all required metadata. */
+ if (!body->loop_father || !body->loop_father->header
+ || !body->loop_father->simple_loop_desc)
+ return GEN_INT (1);
+ basic_block pre_loop_bb1 = loop_preheader_edge (body->loop_father)->src;
+ basic_block pre_loop_bb2 = pre_loop_bb1->prev_bb;
+ rtx count = simple_loop_desc (body->loop_father)->niter_expr;
+ rtx shift_expr = NULL_RTX;
+ rtx initial_compare = NULL_RTX;
+
+ /* Doloop can only be done "elementwise" with predicated dlstp/letp if it
+ contains a VCTP on the number of elements processed by the loop.
+ Find the VCTP predicate generation inside the loop body BB. */
+ rtx_insn *vctp_insn = arm_mve_get_loop_vctp (body);
+ if (!vctp_insn)
+ return GEN_INT (1);
+
+ /* Additionally, the iteration counter must only get decremented by the
+ number of MVE lanes (as per the data type).
+ There are only two types of loops that can be turned into dlstp/letp loops:
+ A) Loops of the form:
+ while (num_of_elem > 0)
+ {
+ p = vctp<size> (num_of_elem)
+	      num_of_elem -= num_of_lanes;
+ }
+ B) Loops of the form:
+ int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes
+ for (i = 0; i < num_of_iters; i++)
+ {
+ p = vctp<size> (num_of_elem)
+	      num_of_elem -= num_of_lanes;
+ }
+
+ These can be verified through the "count" variable in the middle-end
+ (a.k.a. get_simple_loop_desc (loop)->desc->niter_expr). This is the
+ expression used to calculate the number of iterations that the loop would
+ execute for a standard dls/le loop.
+
+ For dlstp/letp we only support cases where this is a power of 2, so from
+ "count" we want to extract something like:
+ ( [l/a] shiftrt: (x) (const_int y))
+ For loops of form A), "count" is already a shiftrt expression.
+ For loops of form B), "count" gets given as:
+ (plus: (not (i)) (num_of_iters))
+ with setup happening in a previous basic block. Here we need to verify:
+ * That i is _always_ initialized to (const_int 0)
+ * That num_of_iters is a shiftrt expression.
+ */
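+  /* As an illustrative sketch (names and modes hypothetical): for a form A)
+     loop using an 8-lane vctp, "count" would be something like:
+       (lshiftrt:SI (plus:SI (reg:SI n) (const_int 7)) (const_int 3))
+     from whose (const_int 3) we later derive a loop decrement of
+     1 << 3 == 8 elements per iteration.  */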
+ if (GET_CODE (count) == LSHIFTRT
+ || GET_CODE (count) == ASHIFTRT)
+ {
+ shift_expr = count;
+      /* In this situation where we are looping on a decreasing number of
+	 elements, a dlstp/letp loop can only work if the looping ends when
+	 the element counter reaches zero and not some other value (e.g.
+	 n > 0 works, n > 1 does not), or we could incorrectly end up running
+	 one additional iteration.  To bypass any hoisting that the compiler
+	 may have done with the first arg of `count`, we can instead look at
+	 the bb before the loop preheader: this should end with a cmp+jump
+	 pair, where the second operand of the cmp needs to be
+	 (const_int 0).  */
+ if (!pre_loop_bb2 || !BB_END (pre_loop_bb2)
+ || !prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2))
+ || !INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2))))
+ return GEN_INT (1);
+ else
+ initial_compare
+ = PATTERN (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb2)));
+
+ if (!(initial_compare && GET_CODE (initial_compare) == SET
+ && cc_register (XEXP (initial_compare, 0), VOIDmode)
+ && GET_CODE (XEXP (initial_compare, 1)) == COMPARE
+ && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1))
+ && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0))
+ return GEN_INT (1);
+ }
+ else if (GET_CODE (count) == PLUS)
+ {
+ if (!(GET_CODE (XEXP (count, 0)) == NOT
+ && REG_P (XEXP (XEXP (count, 0), 0)) && REG_P (XEXP (count, 1))))
+ return GEN_INT (1);
+ else
+ {
+	  /* Verify the first argument to the plus.  */
+	  rtx loop_counter = XEXP (XEXP (count, 0), 0);
+	  df_ref loop_counter_init_def
+	    = df_bb_regno_last_def_find (pre_loop_bb1, REGNO (loop_counter));
+	  /* Check that a DEF exists before dereferencing it: if the counter
+	     is not initialized in the preheader we cannot verify that it
+	     starts at zero.  */
+	  if (!loop_counter_init_def)
+	    return GEN_INT (1);
+	  rtx loop_counter_init
+	    = PATTERN (DF_REF_INSN (loop_counter_init_def));
+	  if (!(GET_CODE (loop_counter_init) == SET
+		&& CONST_INT_P (XEXP (loop_counter_init, 1))
+		&& INTVAL (XEXP (loop_counter_init, 1)) == 0))
+	    return GEN_INT (1);
+
+ /* Verify the second argument to the plus. */
+ rtx count_max = XEXP (count, 1);
+ df_ref counter_max_def = NULL;
+ counter_max_def = DF_REG_DEF_CHAIN (REGNO (count_max));
+ if (counter_max_def
+ && (GET_CODE (XEXP (single_set (DF_REF_INSN (counter_max_def)),
+ 1))
+ == LSHIFTRT
+ || GET_CODE (
+ XEXP (single_set (DF_REF_INSN (counter_max_def)), 1))
+ == ASHIFTRT))
+ {
+ shift_expr
+ = XEXP (single_set (DF_REF_INSN (counter_max_def)), 1);
+ }
+ else
+ return GEN_INT (1);
+ }
+ }
+ else
+ return GEN_INT (1);
+
+  /* Check the validity of the shift: the second operand needs to be a
+     CONST_INT, since we take its INTVAL below.  */
+  if (!CONST_INT_P (XEXP (shift_expr, 1)))
+    return GEN_INT (1);
+  /* Extract the loop decrement from the [A/L]SHIFTRT second operand of
+     count.  */
+  decrementnum = (1 << (INTVAL (XEXP (shift_expr, 1))));
+ /* Ensure it matches the number of lanes of the vctp instruction. */
+ if (decrementnum != arm_mve_get_vctp_lanes (PATTERN (vctp_insn)))
+ return GEN_INT (1);
+
+ rtx_insn *insn = 0;
+ rtx_insn *cur_insn = 0;
+ rtx_insn *seq;
+ rtx vctp_vpr_generated = NULL_RTX;
+
+ /* Scan through the insns in the loop bb and emit the transformed bb
+ insns to a sequence. */
+ start_sequence ();
+ FOR_BB_INSNS (body, insn)
+ {
+ rtx insn_vpr_reg_operand = NULL_RTX;
+ if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+ continue;
+ else if (NOTE_P (insn))
+ emit_note ((enum insn_note) NOTE_KIND (insn));
+ else if (DEBUG_INSN_P (insn))
+ emit_debug_insn (PATTERN (insn));
+ else if (!INSN_P (insn))
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+      /* When we find the vctp instruction: it may be followed by a
+	 zero-extend insn to SImode.  If it is, then save the zero-extended
+	 REG into vctp_vpr_generated.  If there is no zero-extend, then
+	 store the raw output of the vctp.  For any VPT-predicated
+	 instructions we need to ensure that the VPR they use is the same
+	 as the one given here, and they often consume the output of a
+	 subreg of the SImode zero-extended VPR reg.  As a result, comparing
+	 against the output of the zero-extend is more likely to succeed.
+	 This code also guarantees to us that the vctp comes before any
+	 instructions that use the VPR within the loop, which is required
+	 for the dlstp/letp transform to succeed.  */
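+      /* A rough sketch of the def-use chain being matched here (modes and
+	 operands are illustrative only):
+	   vctp:        (set (reg:V8BI p) (unspec:V8BI [(reg:SI n)] VCTP))
+	   zero-extend: (set (reg:SI r) (zero_extend:SI (reg:V8BI p)))
+	   consumer:    (... (subreg:V8BI (reg:SI r) 0) ...)  */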
+ else if (insn == vctp_insn)
+ {
+ rtx_insn *next_use1 = NULL;
+ df_ref use;
+ for (use = DF_REG_USE_CHAIN (
+ DF_REF_REGNO (DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (insn))));
+ use; use = DF_REF_NEXT_REG (use))
+ if (!next_use1 && NONDEBUG_INSN_P (DF_REF_INSN (use)))
+ next_use1 = DF_REF_INSN (use);
+
+ if (GET_CODE (SET_SRC (single_set (next_use1))) == ZERO_EXTEND)
+ {
+ rtx_insn *next_use2 = NULL;
+ for (use = DF_REG_USE_CHAIN (DF_REF_REGNO (
+ DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (next_use1))));
+ use; use = DF_REF_NEXT_REG (use))
+ if (!next_use2 && NONDEBUG_INSN_P (DF_REF_INSN (use)))
+ next_use2 = DF_REF_INSN (use);
+
+ if (GET_CODE (SET_SRC (single_set (next_use2))) == SUBREG)
+ vctp_vpr_generated = XEXP (PATTERN (next_use2), 0);
+ }
+
+ if (!vctp_vpr_generated)
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+ /* Also emit a USE of the source register of the vctp.
+ This holds the number of elements being processed
+ by the loop. This later gets stored into `count`
+ for the middle-end to initialise the loop counter. */
+ emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0));
+ continue;
+ }
+ /* If the insn pattern requires the use of the VPR value from the
+ vctp as an input parameter. */
+ else if ((insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn))
+ && rtx_equal_p (vctp_vpr_generated, insn_vpr_reg_operand))
+ {
+ gcc_assert (MVE_VPT_PREDICATED_INSN_P (insn));
+ int new_icode = get_attr_mve_unpredicated_insn (insn);
+ extract_insn (insn);
+ rtx arr[8];
+ int j = 0;
+
+	  /* When transforming a VPT-predicated instruction
+	     into its unpredicated equivalent we need to drop
+	     the VPR operand and we may need to also drop a
+	     merge "vuninit" input operand, depending on the
+	     instruction pattern.  Here ensure that we have at
+	     most a two-operand difference between the two
+	     instructions.  */
+ int n_operands_diff
+ = recog_data.n_operands - insn_data[new_icode].n_operands;
+ gcc_assert (n_operands_diff > 0 && n_operands_diff <= 2);
+
+ /* Then, loop through the operands of the predicated
+ instruction, and retain the ones that map to the
+ unpredicated instruction. */
+ for (int i = 0; i < recog_data.n_operands; i++)
+ {
+ /* Ignore the VPR and, if needed, the vuninit
+ operand. */
+ if (insn_vpr_reg_operand == recog_data.operand[i]
+ || (n_operands_diff == 2
+ && !strcmp (recog_data.constraints[i], "0")))
+ continue;
+ else
+ {
+ arr[j] = recog_data.operand[i];
+ j++;
+ }
+ }
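+
+	  /* For instance (hypothetical operand layout): a predicated add
+	     with operands (dst, src1, src2, merge, vpr) maps onto the
+	     unpredicated add's (dst, src1, src2), the VPR and "vuninit"
+	     merge operands having been dropped above.  */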
+
+	  /* Finally, emit the unpredicated instruction.  */
+ switch (j)
+ {
+ case 1:
+ emit_insn (GEN_FCN (new_icode) (arr[0]));
+ break;
+ case 2:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+ break;
+ case 3:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2]));
+ break;
+ case 4:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3]));
+ break;
+ case 5:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3],
+ arr[4]));
+ break;
+ case 6:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3],
+ arr[4], arr[5]));
+ break;
+ case 7:
+ emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2], arr[3],
+ arr[4], arr[5], arr[6]));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to
+ make sure that it is still valid within the dlstp/letp loop. */
+ else
+ {
+	  /* None of the registers USE-d by the instruction may be the VPR
+	     value vctp_vpr_generated.  This blocks the optimisation if
+	     there are any instructions that use the optimised-out VPR value
+	     in any way other than as a VPT block predicate.  */
+ df_ref insn_uses = NULL;
+ FOR_EACH_INSN_USE (insn_uses, insn)
+ {
+ if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses)))
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+ }
+ /* If within the loop we have an MVE vector instruction that is
+ unpredicated, the dlstp/letp looping will add implicit
+ predication to it. This will result in a change in behaviour
+ of the instruction, so we need to find out if any instructions
+ that feed into the current instruction were implicitly
+ predicated. */
+ if (arm_mve_check_df_for_implic_predic (insn, vctp_insn,
+ vctp_vpr_generated))
+ {
+ if (mve_memory_operand (SET_DEST (single_set (insn)),
+ GET_MODE (SET_DEST (single_set (insn)))))
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+
+ /* Next, if we have identified that the current DEF will be
+ modified by such implicit predication, scan through all the
+ insns that USE it and bail out if any one is outside the
+ current basic block. */
+ df_ref insn_def = NULL;
+ insn_def = DF_INSN_INFO_DEFS (DF_INSN_INFO_GET (insn));
+ if (insn_def)
+ {
+ for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def));
+ use; use = DF_REF_NEXT_REG (use))
+ {
+ rtx_insn *next_use_insn = DF_REF_INSN (use);
+ if (NONDEBUG_INSN_P (next_use_insn))
+ {
+ if (BLOCK_FOR_INSN (insn)
+ != BLOCK_FOR_INSN (next_use_insn))
+ {
+ end_sequence ();
+ return GEN_INT (1);
+ }
+ }
+ }
+ }
+ }
+ emit_insn (PATTERN (insn));
+ }
+ }
+ seq = get_insns ();
+ end_sequence ();
+
+ /* Re-write the entire BB contents with the transformed
+ sequence. */
+ FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+ if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)))
+ delete_insn (insn);
+ for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn))
+ if (NOTE_P (insn))
+ emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
+ else if (DEBUG_INSN_P (insn))
+ emit_debug_insn_after (PATTERN (insn), BB_END (body));
+ else
+ emit_insn_after (PATTERN (insn), BB_END (body));
+
+ emit_jump_insn_after (PATTERN (insn), BB_END (body));
+ return GEN_INT (decrementnum);
+}
+
+/* Target hook to place the number of elements to be processed by a
+   dlstp/letp loop into `count`, to initialise the loop counter register.
+   The number of elements was previously extracted from the vctp insn and
+   placed into a USE rtx.
+   We only check that the doloop_end pattern successfully decrements by a
+   number other than -1 for a valid dlstp/letp loop.  No other checking is
+   needed as that was done previously.  */
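+/* For instance, arm_attempt_dlstp_transform will have emitted something
+   like (use (reg:SI n)), where n holds the number of elements; we return
+   that register here so that it becomes the new `count`.  */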
+
+rtx
+arm_allow_elementwise_doloop_p (rtx count, rtx label, rtx doloop)
+{
+ if (doloop
+ && INTVAL (XEXP (SET_SRC (XVECEXP (PATTERN (doloop), 0, 1)), 1)) != -1)
+ {
+ basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+ rtx_insn* insn;
+ FOR_BB_INSNS (body, insn)
+ {
+ if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == USE)
+ {
+ rtx num_elem_reg = copy_rtx (XEXP (PATTERN (insn), 0));
+ delete_insn (insn);
+ return num_elem_reg;
+ }
+ }
+ }
+ return count;
}
#if CHECKING_P
@@ -1464,6 +1464,10 @@
(VADCIQ_M_S "s") (SQRSHRL_64 "64") (SQRSHRL_48 "48")
(UQRSHLL_64 "64") (UQRSHLL_48 "48") (VSHLCQ_M_S "s")
(VSHLCQ_M_U "u")])
+
+(define_int_attr mode1 [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
+ (DLSTP64 "64")])
+
;; Both kinds of return insn.
(define_code_iterator RETURNS [return simple_return])
(define_code_attr return_str [(return "") (simple_return "simple_")])
@@ -1769,6 +1773,8 @@
(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
+ DLSTP64])
;; Define iterators for VCMLA operations
(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
@@ -11100,3 +11100,38 @@
}
DONE;
})
+
+;; Originally expanded by 'predicated_doloop_end'.
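+;; A sketch of its operation: LR holds the number of elements still to be
+;; processed; each iteration adds the (negative) element count in operand 0
+;; and branches back while the result is non-negative.  When the backwards
+;; branch is in range we emit a single letp, otherwise a subs/bgt pair.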
+(define_insn "*predicated_doloop_end_internal"
+ [(set (pc)
+ (if_then_else
+ (ge (plus:SI (reg:SI LR_REGNUM)
+ (match_operand:SI 0 "const_int_operand" ""))
+ (const_int 0))
+ (label_ref (match_operand 1 "" ""))
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (match_dup 0)))
+ (clobber (reg:CC CC_REGNUM))]
+ "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+ {
+ if (get_attr_length (insn) == 4)
+ return "letp\t%|lr, %l1";
+ else
+ return "subs\t%|lr, #%0;bgt\t%l1";
+ }
+ [(set (attr "length")
+ (if_then_else
+ (ltu (minus (pc) (match_dup 1)) (const_int 1024))
+ (const_int 4)
+ (const_int 6)))
+ (set_attr "type" "branch")])
+
+(define_insn "dlstp<mode1>_insn"
+ [
+ (set (reg:SI LR_REGNUM)
+ (unspec:SI [(match_operand:SI 0 "s_register_operand" "r")]
+ DLSTP))
+ ]
+ "TARGET_32BIT && TARGET_HAVE_LOB && TARGET_HAVE_MVE && TARGET_THUMB2"
+ "dlstp.<mode1>\t%|lr, %0")
\ No newline at end of file
@@ -1613,7 +1613,7 @@
(use (match_operand 1 "" ""))] ; label
"TARGET_32BIT"
"
- {
+{
/* Currently SMS relies on the do-loop pattern to recognize loops
where (1) the control part consists of all insns defining and/or
using a certain 'count' register and (2) the loop count can be
@@ -1623,41 +1623,67 @@
Also used to implement the low over head loops feature, which is part of
the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
- if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
- {
- rtx s0;
- rtx bcomp;
- rtx loc_ref;
- rtx cc_reg;
- rtx insn;
- rtx cmp;
-
- if (GET_MODE (operands[0]) != SImode)
- FAIL;
-
- s0 = operands [0];
-
- /* Low over head loop instructions require the first operand to be LR. */
- if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1]))
- s0 = gen_rtx_REG (SImode, LR_REGNUM);
-
- if (TARGET_THUMB2)
- insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
- else
- insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-
- cmp = XVECEXP (PATTERN (insn), 0, 0);
- cc_reg = SET_DEST (cmp);
- bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
- loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]);
- emit_jump_insn (gen_rtx_SET (pc_rtx,
- gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
- loc_ref, pc_rtx)));
- DONE;
- }
- else
- FAIL;
- }")
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
+ {
+ rtx s0;
+ rtx bcomp;
+ rtx loc_ref;
+ rtx cc_reg;
+ rtx insn;
+ rtx cmp;
+ rtx decrement_num;
+
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+
+ s0 = operands[0];
+
+ if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands[1]))
+ {
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+
+	  /* If we have a compatible MVE target, try and analyse the loop
+	     contents to determine if we can use predicated dlstp/letp
+	     looping.  */
+ if (TARGET_HAVE_MVE && TARGET_THUMB2
+ && (decrement_num = arm_attempt_dlstp_transform (operands[1]))
+ && (INTVAL (decrement_num) != 1))
+ {
+ insn = emit_insn
+ (gen_thumb2_addsi3_compare0
+ (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num)))));
+ cmp = XVECEXP (PATTERN (insn), 0, 0);
+ cc_reg = SET_DEST (cmp);
+ bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ emit_jump_insn (gen_rtx_SET (pc_rtx,
+ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+ loc_ref, pc_rtx)));
+ DONE;
+ }
+
+ /* Otherwise, try standard decrement-by-one dls/le looping. */
+ if (TARGET_THUMB2)
+ insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0,
+ GEN_INT (-1)));
+ else
+ insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+
+ cmp = XVECEXP (PATTERN (insn), 0, 0);
+ cc_reg = SET_DEST (cmp);
+ bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ emit_jump_insn (gen_rtx_SET (pc_rtx,
+ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+ loc_ref, pc_rtx)));
+ DONE;
+ }
+ else
+ FAIL;
+ }
+ else
+ FAIL;
+}")
(define_insn "*clear_apsr"
[(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR)
@@ -1755,7 +1781,37 @@
{
if (REGNO (operands[0]) == LR_REGNUM)
{
- emit_insn (gen_dls_insn (operands[0]));
+ /* Pick out the number by which we are decrementing the loop counter
+ in every iteration. If it's > 1, then use dlstp. */
+ int const_int_dec_num
+ = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1),
+ 1),
+ 1)));
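+      /* E.g. a decrement of 16 means sixteen 8-bit lanes are processed per
+	 iteration, so emit dlstp.8; a decrement of 1 is a standard dls.  */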
+ switch (const_int_dec_num)
+ {
+ case 16:
+ emit_insn (gen_dlstp8_insn (operands[0]));
+ break;
+
+ case 8:
+ emit_insn (gen_dlstp16_insn (operands[0]));
+ break;
+
+ case 4:
+ emit_insn (gen_dlstp32_insn (operands[0]));
+ break;
+
+ case 2:
+ emit_insn (gen_dlstp64_insn (operands[0]));
+ break;
+
+ case 1:
+ emit_insn (gen_dls_insn (operands[0]));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
DONE;
}
else
@@ -581,6 +581,10 @@
VADDLVQ_U
VCTP
VCTP_M
+ DLSTP8
+ DLSTP16
+ DLSTP32
+ DLSTP64
VPNOT
VCREATEQ_F
VCVTQ_N_TO_F_S
@@ -11796,6 +11796,14 @@ loops, and will help ivopts to make some decisions.
The default version of this hook returns false.
@end deftypefn
+@deftypefn {Target Hook} rtx TARGET_ALLOW_ELEMENTWISE_DOLOOP_P (rtx @var{count}, rtx @var{label}, rtx @var{doloop})
+This target hook allows the target to support loop-doloop optimisations
+where the value that gets put into the loop counter register is not a
+pre-calculation of the number of iterations of the loop.  For instance,
+the value used can be the number of elements that the loop will process.
+The default version of this hook returns the same rtx it was given.
+@end deftypefn
+
@deftypevr {Target Hook} bool TARGET_HAVE_COUNT_REG_DECR_P
Return true if the target supports hardware count register for decrement
and branch.
@@ -7732,6 +7732,8 @@ to by @var{ce_info}.
@hook TARGET_PREDICT_DOLOOP_P
+@hook TARGET_ALLOW_ELEMENTWISE_DOLOOP_P
+
@hook TARGET_HAVE_COUNT_REG_DECR_P
@hook TARGET_DOLOOP_COST_FOR_GENERIC
@@ -85,29 +85,29 @@ doloop_condition_get (rtx_insn *doloop_pat)
forms:
1) (parallel [(set (pc) (if_then_else (condition)
- (label_ref (label))
- (pc)))
- (set (reg) (plus (reg) (const_int -1)))
- (additional clobbers and uses)])
+ (label_ref (label))
+ (pc)))
+ (set (reg) (plus (reg) (const_int -n)))
+ (additional clobbers and uses)])
The branch must be the first entry of the parallel (also required
by jump.cc), and the second entry of the parallel must be a set of
the loop counter register. Some targets (IA-64) wrap the set of
the loop counter in an if_then_else too.
- 2) (set (reg) (plus (reg) (const_int -1))
- (set (pc) (if_then_else (reg != 0)
- (label_ref (label))
- (pc))).
+ 2) (set (reg) (plus (reg) (const_int -n))
+ (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+ (pc))).
Some targets (ARM) do the comparison before the branch, as in the
following form:
- 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
- (set (reg) (plus (reg) (const_int -1)))])
- (set (pc) (if_then_else (cc == NE)
- (label_ref (label))
- (pc))) */
+ 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -n), 0)))
+ (set (reg) (plus (reg) (const_int -n)))])
+ (set (pc) (if_then_else (cc == NE)
+ (label_ref (label))
+ (pc))) */
pattern = PATTERN (doloop_pat);
@@ -143,7 +143,7 @@ doloop_condition_get (rtx_insn *doloop_pat)
|| GET_CODE (cmp_arg1) != PLUS)
return 0;
reg_orig = XEXP (cmp_arg1, 0);
- if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
+ if (!CONST_INT_P (XEXP (cmp_arg1, 1))
|| !REG_P (reg_orig))
return 0;
cc_reg = SET_DEST (cmp_orig);
@@ -156,7 +156,8 @@ doloop_condition_get (rtx_insn *doloop_pat)
{
/* We expect the condition to be of the form (reg != 0) */
cond = XEXP (SET_SRC (cmp), 0);
- if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+ if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE)
+ || XEXP (cond, 1) != const0_rtx)
return 0;
}
}
@@ -173,14 +174,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
if (! REG_P (reg))
return 0;
- /* Check if something = (plus (reg) (const_int -1)).
+ /* Check if something = (plus (reg) (const_int -n)).
On IA-64, this decrement is wrapped in an if_then_else. */
inc_src = SET_SRC (inc);
if (GET_CODE (inc_src) == IF_THEN_ELSE)
inc_src = XEXP (inc_src, 1);
if (GET_CODE (inc_src) != PLUS
|| XEXP (inc_src, 0) != reg
- || XEXP (inc_src, 1) != constm1_rtx)
+ || !CONST_INT_P (XEXP (inc_src, 1)))
return 0;
/* Check for (set (pc) (if_then_else (condition)
@@ -211,42 +212,49 @@ doloop_condition_get (rtx_insn *doloop_pat)
|| (GET_CODE (XEXP (condition, 0)) == PLUS
&& XEXP (XEXP (condition, 0), 0) == reg))
{
- if (GET_CODE (pattern) != PARALLEL)
/* For the second form we expect:
- (set (reg) (plus (reg) (const_int -1))
- (set (pc) (if_then_else (reg != 0)
- (label_ref (label))
- (pc))).
+ (set (reg) (plus (reg) (const_int -n))
+ (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+ (pc))).
- is equivalent to the following:
+ If n == 1, that is equivalent to the following:
- (parallel [(set (pc) (if_then_else (reg != 1)
- (label_ref (label))
- (pc)))
- (set (reg) (plus (reg) (const_int -1)))
- (additional clobbers and uses)])
+ (parallel [(set (pc) (if_then_else (reg != 1)
+ (label_ref (label))
+ (pc)))
+ (set (reg) (plus (reg) (const_int -1)))
+ (additional clobbers and uses)])
- For the third form we expect:
+ For the third form we expect:
- (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
- (set (reg) (plus (reg) (const_int -1)))])
- (set (pc) (if_then_else (cc == NE)
- (label_ref (label))
- (pc)))
+ (parallel [(set (cc) (compare ((plus (reg) (const_int -n)), 0))
+ (set (reg) (plus (reg) (const_int -n)))])
+ (set (pc) (if_then_else (cc == NE)
+ (label_ref (label))
+ (pc)))
- which is equivalent to the following:
+     Which, for n == 1, is also equivalent to the following:
- (parallel [(set (cc) (compare (reg, 1))
- (set (reg) (plus (reg) (const_int -1)))
- (set (pc) (if_then_else (NE == cc)
- (label_ref (label))
- (pc))))])
+ (parallel [(set (cc) (compare (reg, 1))
+ (set (reg) (plus (reg) (const_int -1)))
+ (set (pc) (if_then_else (NE == cc)
+ (label_ref (label))
+ (pc))))])
- So we return the second form instead for the two cases.
+ So we return the second form instead for the two cases.
+ For the "elementwise" form where the decrement number isn't -1,
+ the final value may be exceeded, so use GE instead of NE.
*/
- condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+ if (GET_CODE (pattern) != PARALLEL)
+ {
+ if (INTVAL (XEXP (inc_src, 1)) != -1)
+ condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx);
+ else
+	    condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+ }
return condition;
}
@@ -685,17 +693,6 @@ doloop_optimize (class loop *loop)
return false;
}
- max_cost
- = COSTS_N_INSNS (param_max_iterations_computation_cost);
- if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
- > max_cost)
- {
- if (dump_file)
- fprintf (dump_file,
- "Doloop: number of iterations too costly to compute.\n");
- return false;
- }
-
if (desc->const_iter)
iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
UNSIGNED);
@@ -722,6 +719,23 @@ doloop_optimize (class loop *loop)
doloop_reg = gen_reg_rtx (mode);
rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
+  /* Not all targets need to pre-calculate the number of iterations of the
+     loop; some instead work by storing the number of elements in the
+     counter_reg and decrementing that.  Call the appropriate target hook to
+     modify the value of count.  */
+ count = targetm.allow_elementwise_doloop_p (count, start_label, doloop_seq);
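+  /* For example, on a target implementing dlstp/letp-style loops (such as
+     the ARM backend's arm_allow_elementwise_doloop_p), `count` may now be
+     the register holding the number of elements to process rather than the
+     pre-computed iteration count.  */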
+
+ max_cost
+ = COSTS_N_INSNS (param_max_iterations_computation_cost);
+ if (set_src_cost (count, mode, optimize_loop_for_speed_p (loop))
+ > max_cost)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "Doloop: number of iterations too costly to compute.\n");
+ return false;
+ }
+
word_mode_size = GET_MODE_PRECISION (word_mode);
word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
if (! doloop_seq
@@ -4411,6 +4411,16 @@ The default version of this hook returns false.",
bool, (class loop *loop),
default_predict_doloop_p)
+DEFHOOK
+(allow_elementwise_doloop_p,
+ "This target hook allows the target to support loop-doloop optimisations\n\
+where the value that gets put into the loop counter register is not a\n\
+pre-calculation of the number of iterations of the loop.  For instance,\n\
+the value used can be the number of elements that the loop will process.\n\
+The default version of this hook returns the same rtx it was given.",
+ rtx, (rtx count, rtx label, rtx doloop),
+ default_allow_elementwise_doloop_p)
+
DEFHOOKPOD
(have_count_reg_decr_p,
"Return true if the target supports hardware count register for decrement\n\
@@ -88,6 +88,7 @@ extern bool default_fixed_point_supported_p (void);
extern bool default_has_ifunc_p (void);
extern bool default_predict_doloop_p (class loop *);
+extern rtx default_allow_elementwise_doloop_p (rtx, rtx, rtx);
extern machine_mode default_preferred_doloop_mode (machine_mode);
extern const char * default_invalid_within_doloop (const rtx_insn *);
@@ -661,6 +661,12 @@ default_predict_doloop_p (class loop *loop ATTRIBUTE_UNUSED)
return false;
}
+rtx
+default_allow_elementwise_doloop_p (rtx count, rtx, rtx)
+{
+ return count;
+}
+
/* By default, just use the input MODE itself. */
machine_mode
@@ -140,10 +140,196 @@ TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m)
TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m)
TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m)
+/* Now test some more configurations. */
+
+/* Test a for-loop format, decrementing down to zero.  */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test1 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i >= 0; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+
+/* Iteration counter counting up to num_iter. */
+void test2 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ }
+}
+
+/* Using an unpredicated arithmetic instruction within the loop. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ vstrbq_p_u8 (d, vd, p);
+ n-=16;
+ }
+}
+
+/* Using a different VPR value for one instruction in the loop. */
+void test4 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Generating and using a constant VPR value in the loop, with a vctp. */
+void test5 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Generating and using a different VPR value in the loop, with a vctp. */
+void test6 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ g++;
+ }
+}
+
+/* Generating and using a different VPR value in the loop, with a vctp_m. */
+void test7 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p2 = vctp32q_m (n, p1);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Generating and using a different VPR value in the loop, with a vctp_m
+   that is tied to the base vctp VPR.  */
+void test8 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q_m (n, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Generating and using a different VPR value in the loop, with a vcmp. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vb);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m
+   that is tied to the base vctp VPR.  */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
/* The final number of DLSTPs currently is calculated by the number of
- `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macros * 6. */
-/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */
-/* { dg-final { scan-assembler-times {\tletp} 144 } } */
-/* { dg-final { scan-assembler-not "\tvctp\t" } } */
-/* { dg-final { scan-assembler-not "\tvpst\t" } } */
-/* { dg-final { scan-assembler-not "p0" } } */
+ `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macros * 6 + 11. */
+/* { dg-final { scan-assembler-times {\tdlstp} 155 } } */
+/* { dg-final { scan-assembler-times {\tletp} 155 } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ int16x8_t va = vldrhq_z_s16 (a, p);
+ int16x8_t vb = vldrhq_z_s16 (b, p);
+ int16x8_t vc = vaddq_x_s16 (va, vb, p);
+ vstrhq_p_s16 (c, vc, p);
+ c+=8;
+ a+=8;
+ b+=8;
+ n-=8;
+ }
+}
+
+int main ()
+{
+ int i;
+ int16_t temp1[N];
+ int16_t temp2[N];
+ int16_t temp3[N];
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus16 (temp1, temp2, temp3, 0);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus16 (temp1, temp2, temp3, 1);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 7);
+ check_plus16 (temp1, temp2, temp3, 7);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus16 (temp1, temp2, temp3, 8);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus16 (temp1, temp2, temp3, 9);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus16 (temp1, temp2, temp3, 16);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus16 (temp1, temp2, temp3, 17);
+
+ reset_data16 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+int main ()
+{
+ int i;
+ int32_t temp1[N];
+ int32_t temp2[N];
+ int32_t temp3[N];
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus32 (temp1, temp2, temp3, 0);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus32 (temp1, temp2, temp3, 1);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 3);
+ check_plus32 (temp1, temp2, temp3, 3);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 4);
+ check_plus32 (temp1, temp2, temp3, 4);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 5);
+ check_plus32 (temp1, temp2, temp3, 5);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus32 (temp1, temp2, temp3, 8);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus32 (temp1, temp2, temp3, 9);
+
+ reset_data32 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp64q (n);
+ int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (8, 0), p);
+ vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (8, 0), va, p);
+ c+=2;
+ a+=2;
+ n-=2;
+ }
+}
+
+int main ()
+{
+ int i;
+ int64_t temp1[N];
+ int64_t temp3[N];
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 0);
+ check_memcpy64 (temp1, temp3, 0);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 1);
+ check_memcpy64 (temp1, temp3, 1);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 2);
+ check_memcpy64 (temp1, temp3, 2);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 3);
+ check_memcpy64 (temp1, temp3, 3);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 4);
+ check_memcpy64 (temp1, temp3, 4);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 5);
+ check_memcpy64 (temp1, temp3, 5);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 6);
+ check_memcpy64 (temp1, temp3, 6);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 7);
+ check_memcpy64 (temp1, temp3, 7);
+
+ reset_data64 (temp1, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "lob.h"
+
+void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ int8x16_t vb = vldrbq_z_s8 (b, p);
+ int8x16_t vc = vaddq_x_s8 (va, vb, p);
+ vstrbq_p_s8 (c, vc, p);
+ c+=16;
+ a+=16;
+ b+=16;
+ n-=16;
+ }
+}
+
+int main ()
+{
+ int i;
+ int8_t temp1[N];
+ int8_t temp2[N];
+ int8_t temp3[N];
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus8 (temp1, temp2, temp3, 0);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus8 (temp1, temp2, temp3, 1);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 15);
+ check_plus8 (temp1, temp2, temp3, 15);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus8 (temp1, temp2, temp3, 16);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus8 (temp1, temp2, temp3, 17);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 32);
+ check_plus8 (temp1, temp2, temp3, 32);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 33);
+ check_plus8 (temp1, temp2, temp3, 33);
+
+ reset_data8 (temp1, temp2, temp3, N);
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
new file mode 100644
@@ -0,0 +1,210 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } } */
+/* { dg-options "-march=armv8.1-m.main+fp.dp+mve.fp -mfloat-abi=hard -mfpu=auto -O3 --save-temps" } */
+
+#include <arm_mve.h>
+
+/* Terminating on a non-zero number of elements. */
+void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ while (n > 1)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Similar, terminating on a non-zero number of elements, but in a for loop
+ format. */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test2 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i >= 2; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a non-zero starting
+   value.  */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 1; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a larger increment.  */
+void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i+=2)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated store instruction within the loop. */
+void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_u8 (d, vd);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated vcmp to generate a new predicate value in the
+ loop and then using it in a store insn. */
+void test14 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated store outside the loop. */
+void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ vx = vaddq_u8 (vx, vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ vstrbq_u8 (c, vx);
+}
+
+/* Using an unpredicated op with a scalar output, where the result is valid
+ outside the bb. */
+uint8_t test7 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Using an unpredicated op with a scalar output, then a scalar op,
+ where the result is valid outside the bb. */
+uint8_t test8 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ sum += g;
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Using a VPR that gets modified within the loop. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p++;
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using a VPR that gets re-generated within the loop. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ mve_pred16_t p = vctp32q (n);
+ while (n > 0)
+ {
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p = vctp32q (n);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using vctp32q_m instead of vctp32q. */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q_m (n, p0);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* { dg-final { scan-assembler-not "\tdlstp" } } */
+/* { dg-final { scan-assembler-not "\tletp" } } */
@@ -1,15 +1,131 @@
#include <string.h>
-
+#include <stdint.h>
/* Common code for lob tests. */
#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
-#define N 10000
+#define N 100
+
+static void
+reset_data (int *a, int *b, int *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data64 (int64_t *a, int64_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+check_plus (int *a, int *b, int *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
static void
-reset_data (int *a, int *b, int *c)
+check_memcpy64 (int64_t *a, int64_t *c, int x)
{
- memset (a, -1, N * sizeof (*a));
- memset (b, -1, N * sizeof (*b));
- memset (c, -1, N * sizeof (*c));
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != a[i]) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
}
@@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c)
} while (i < N);
}
-void
-check (int *a, int *b, int *c)
-{
- for (int i = 0; i < N; i++)
- {
- NO_LOB;
- if (c[i] != a[i] + b[i])
- abort ();
- }
-}
-
int
main (void)
{
- reset_data (a, b, c);
+ reset_data (a, b, c, N);
loop1 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop2 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop3 (a, b ,c);
- check (a, b ,c);
+ check_plus (a, b, c, N);
return 0;
}
@@ -79,14 +79,14 @@ check (void)
int
main (void)
{
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop1 (a1, b1, c1);
ref1 (a2, b2, c2);
check ();
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop2 (a1, b1, c1);
ref2 (a2, b2, c2);
check ();