Hi,
This v2 fixes a significant compile-time hog in the original patch
for the pass posted here:
https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633355.html
Having seen a large compile-time regression when turning on the early
ldp pass while compiling 502.gcc_r from SPEC CPU 2017, I found that the
benchmark's insn-attrtab.c dominated the compile time, and moreover that
compile time for that file increased by 3.79x when enabling the early
ldp fusion pass at -O.
Running cc1 under a profiler revealed that ~44% of the entire profile
was in rtx_equal_p (inlined via cleanup_tombstones into ldp_fusion_bb).
I missed that we can skip running cleanup_tombstones entirely in the
(common) case that we never emitted a tombstone insn for a given bb.
This patch implements that optimization. This reduces the overhead for
the early ldp pass on that file to around 1%, which seems more like an
acceptable overhead for an additional pass.
Incrementally, the change is as follows:
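In essence (a sketch of the gist, lifted from the full v2 patch below
rather than shown as a literal incremental diff): ldp_bb_info gains an
m_emitted_tombstone flag, which is only set on the paths in fuse_pair
that actually emit a tombstone insn, and cleanup_tombstones bails out
early when the flag is clear:

  void
  ldp_bb_info::cleanup_tombstones ()
  {
    // No need to do anything if we didn't emit a tombstone insn for this bb.
    if (!m_emitted_tombstone)
      return;

    // ... the existing walk over the bb removing tombstones ...
  }

This avoids the per-insn rtx_equal_p checks entirely for blocks where no
tombstone was needed.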
A v2 patch for the pass is attached. Bootstrapped/regtested as a series
on aarch64-linux-gnu, OK for trunk?
Thanks,
Alex
diff --git a/gcc/config.gcc b/gcc/config.gcc
index fa192637a52..9c1a604bd5e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -349,8 +349,8 @@ aarch64*-*-*)
c_target_objs="aarch64-c.o"
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
- extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
- target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
+ extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o aarch64-ldp-fusion.o"
+ target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc \$(srcdir)/config/aarch64/aarch64-ldp-fusion.cc"
target_has_targetm_common=yes
;;
alpha*-*-*)
diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
new file mode 100644
index 00000000000..f1621c9a384
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -0,0 +1,2395 @@
+// LoadPair fusion optimization pass for AArch64.
+// Copyright (C) 2023 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// GCC is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "cfgcleanup.h"
+#include "tree-pass.h"
+#include "ordered-hash-map.h"
+#include "tree-dfa.h"
+#include "fold-const.h"
+#include "tree-hash-traits.h"
+#include "print-tree.h"
+
+using namespace rtl_ssa;
+
+enum
+{
+ LDP_IMM_BITS = 7,
+ LDP_IMM_MASK = (1 << LDP_IMM_BITS) - 1,
+ LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1)),
+ LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1,
+ LDP_MIN_IMM = -LDP_MAX_IMM - 1,
+};
+
+// We pack these fields (load_p, fpsimd_p, and size) into an integer
+// (LFS) which we use as part of the key into the main hash tables.
+//
+// The idea is that we group candidates together only if they agree on
+// the fields below. Candidates that disagree on any of these
+// properties shouldn't be merged together.
+struct lfs_fields
+{
+ bool load_p;
+ bool fpsimd_p;
+ unsigned size;
+};
+
+using insn_list_t = std::list <insn_info *>;
+using insn_iter_t = insn_list_t::iterator;
+
+// A tagged union representing either an RTL-SSA def_info base or a
+// tree decl base.
+class base_info
+{
+ pointer_mux <def_info, union tree_node> mux;
+
+public:
+ base_info (tree decl) : mux (decl) {}
+ base_info (def_info *def) : mux (def) {}
+
+ bool is_reg () const { return mux.is_first (); }
+ def_info *get_reg () const
+ {
+ gcc_checking_assert (mux.is_first ());
+ return mux.known_first ();
+ }
+};
+
+// Information about the accesses at a given offset from a particular
+// base. Stored in an access_group, see below.
+struct access_record
+{
+ poly_int64 offset;
+ std::list<insn_info *> cand_insns;
+ std::list<access_record>::iterator place;
+
+ access_record (poly_int64 off) : offset (off) {}
+};
+
+// A group of accesses where adjacent accesses could be ldp/stp
+// candidates. The splay tree supports efficient insertion,
+// while the list supports efficient iteration.
+struct access_group
+{
+ splay_tree <access_record *> tree;
+ std::list<access_record> list;
+
+ template<typename Alloc>
+ inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn);
+};
+
+// Information about a potential base candidate, used in try_fuse_pair.
+// There may be zero, one, or two viable RTL bases for a given pair.
+struct base_cand
+{
+ def_info *m_def;
+
+ // FROM_INSN is -1 if the base candidate is already shared by both
+ // candidate insns. Otherwise it holds the index of the insn from
+ // which the base originated.
+ int from_insn;
+
+ // Initially: dataflow hazards that arise if we choose this base as
+ // the common base register for the pair.
+ //
+ // Later these get narrowed, taking alias hazards into account.
+ insn_info *hazards[2];
+
+ base_cand (def_info *def, int insn)
+ : m_def (def), from_insn (insn), hazards {nullptr, nullptr} {}
+
+ base_cand (def_info *def) : base_cand (def, -1) {}
+
+ bool viable () const
+ {
+ return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]);
+ }
+};
+
+// State used by the pass for a given basic block.
+struct ldp_bb_info
+{
+ using def_hash = nofree_ptr_hash <def_info>;
+ using expr_key_t = pair_hash <tree_operand_hash, int_hash <int, -1, -2>>;
+ using def_key_t = pair_hash <def_hash, int_hash <int, -1, -2>>;
+
+ // Map of <tree base, LFS> -> access_group.
+ ordered_hash_map <expr_key_t, access_group> expr_map;
+
+ // Map of <RTL-SSA def_info *, LFS> -> access_group.
+ ordered_hash_map <def_key_t, access_group> def_map;
+
+ static const size_t obstack_alignment = sizeof (void *);
+ bb_info *m_bb;
+
+ ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
+ {
+ obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
+ obstack_alignment, obstack_chunk_alloc,
+ obstack_chunk_free);
+ }
+ ~ldp_bb_info ()
+ {
+ obstack_free (&m_obstack, nullptr);
+ }
+
+ inline void track_access (insn_info *, bool load, rtx mem);
+ inline void transform ();
+ inline void cleanup_tombstones ();
+
+private:
+ // Did we emit a tombstone insn for this bb?
+ bool m_emitted_tombstone;
+ obstack m_obstack;
+
+ inline splay_tree_node<access_record *> *node_alloc (access_record *);
+
+ template<typename Map>
+ inline void traverse_base_map (Map &map);
+ inline void transform_for_base (base_info binfo, int load_size,
+ access_group &group);
+
+ inline bool try_form_pairs (insn_list_t *, insn_list_t *,
+ bool load_p, unsigned access_size,
+ base_info binfo);
+
+ inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
+};
+
+splay_tree_node<access_record *> *
+ldp_bb_info::node_alloc (access_record *access)
+{
+ using T = splay_tree_node<access_record *>;
+ void *addr = obstack_alloc (&m_obstack, sizeof (T));
+ return new (addr) T (access);
+}
+
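+// Return true if the address of MEM is a legitimate memory address for
+// its mode and address space.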
+static bool
+mem_valid_p (rtx mem)
+{
+ addr_space_t as = MEM_ADDR_SPACE (mem);
+ machine_mode mode = GET_MODE (mem);
+ rtx addr = XEXP (mem, 0);
+ return memory_address_addr_space_p (mode, addr, as);
+}
+
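+// Try to adjust MEM to mode MODE at byte offset OFFSET, returning the
+// adjusted mem if its address remains valid, or NULL_RTX otherwise.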
+static rtx
+try_adjust_address (rtx mem, machine_mode mode, poly_int64 offset)
+{
+ gcc_checking_assert (mem_valid_p (mem));
+ rtx adjusted = adjust_address_nv (mem, mode, offset);
+ return mem_valid_p (adjusted) ? adjusted : NULL_RTX;
+}
+
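+// Return true if MODE is an operand mode for which we consider forming
+// ldp/stp insns at this stage of compilation (honouring the tuning
+// decision on whether to allow Q-register pairs).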
+static bool
+ldp_operand_mode_p (machine_mode mode)
+{
+ const bool allow_qregs
+ = !(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
+
+ if (VECTOR_MODE_P (mode))
+ {
+ const auto poly_size = GET_MODE_SIZE (mode);
+ if (!poly_size.is_constant ())
+ return false;
+
+ HOST_WIDE_INT size = poly_size.to_constant ();
+ return size == 4
+ || size == 8
+ || (size == 16 && allow_qregs);
+ }
+
+ switch (mode)
+ {
+ case E_SImode:
+ case E_DImode:
+ case E_SFmode:
+ case E_SDmode:
+ case E_DFmode:
+ case E_DDmode:
+ return true;
+ case E_TFmode:
+ case E_TDmode:
+ return allow_qregs;
+ case E_TImode:
+ return allow_qregs && reload_completed;
+ default:
+ return false;
+ }
+}
+
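+// Pack the LFS fields (load_p, fpsimd_p and access size) of FIELDS into
+// a single integer for use as part of a hash table key.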
+static int
+encode_lfs (lfs_fields fields)
+{
+ int size_log2 = exact_log2 (fields.size);
+ gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
+ return ((int)fields.load_p << 3)
+ | ((int)fields.fpsimd_p << 2)
+ | (size_log2 - 2);
+}
+
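+// Inverse of encode_lfs: unpack an encoded LFS integer back into its
+// constituent fields.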
+static lfs_fields
+decode_lfs (int lfs)
+{
+ bool load_p = (lfs & (1 << 3));
+ bool fpsimd_p = (lfs & (1 << 2));
+ unsigned size = 1U << ((lfs & 3) + 2);
+ return { load_p, fpsimd_p, size };
+}
+
+template<typename Alloc>
+void
+access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
+{
+ auto insert_before = [&](std::list<access_record>::iterator after)
+ {
+ auto it = list.emplace (after, offset);
+ it->cand_insns.push_back (insn);
+ it->place = it;
+ return &*it;
+ };
+
+ if (!list.size ())
+ {
+ auto access = insert_before (list.end ());
+ tree.insert_max_node (alloc_node (access));
+ return;
+ }
+
+ auto compare = [&](splay_tree_node<access_record *> *node)
+ {
+ return compare_sizes_for_sort (offset, node->value ()->offset);
+ };
+ auto result = tree.lookup (compare);
+ splay_tree_node<access_record *> *node = tree.root ();
+ if (result == 0)
+ node->value ()->cand_insns.push_back (insn);
+ else
+ {
+ auto it = node->value ()->place;
+ auto after = (result > 0) ? std::next (it) : it;
+ auto access = insert_before (after);
+ tree.insert_child (node, result > 0, alloc_node (access));
+ }
+}
+
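+// Try to track the access to MEM at INSN (with LFS fields LFS) via its
+// MEM_EXPR base.  Return true if we tracked the access, false if we
+// punted.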
+bool
+ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
+{
+ if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
+ return false;
+
+ // Punt on auto-inc accesses for now so we don't have to deal
+ // with the complexity later on. TODO: would be good to relax
+ // this eventually.
+ if (side_effects_p (XEXP (mem, 0)))
+ return false;
+
+ poly_int64 offset;
+ tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
+ &offset);
+ if (!base_expr || !DECL_P (base_expr))
+ return false;
+
+ offset += MEM_OFFSET (mem);
+
+ const machine_mode mem_mode = GET_MODE (mem);
+ const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+
+ // Punt on misaligned offsets.
+ if (offset.coeffs[0] & (mem_size - 1))
+ return false;
+
+ const auto key = std::make_pair (base_expr, encode_lfs (lfs));
+ access_group &group = expr_map.get_or_insert (key, NULL);
+ auto alloc = [&](access_record *access) { return node_alloc (access); };
+ group.track (alloc, offset, insn);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "[bb %u] tracking insn %d via ",
+ m_bb->index (), insn->uid ());
+ print_node_brief (dump_file, "mem expr", base_expr, 0);
+ fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
+ lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
+ print_dec (offset, dump_file);
+ fprintf (dump_file, "]\n");
+ }
+
+ return true;
+}
+
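+// Main entry point for tracking a candidate access: INSN either loads
+// from (LOAD_P) or stores to MEM.  Punt on accesses we can't (or don't
+// want to) handle; otherwise record the access in the appropriate map.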
+void
+ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
+{
+ // We can't combine volatile MEMs, so punt on these.
+ if (MEM_VOLATILE_P (mem))
+ return;
+
+ const machine_mode mem_mode = GET_MODE (mem);
+ if (!ldp_operand_mode_p (mem_mode))
+ return;
+
+ // Note ldp_operand_mode_p already rejected VL modes.
+ const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+ const bool fpsimd_mode_p = GET_MODE_CLASS (mem_mode) != MODE_INT;
+
+ // For stores, validate the operand being stored.
+ if (!load_p)
+ {
+ rtx st_op = XEXP (PATTERN (insn->rtl ()), 1);
+ if (!register_operand (st_op, mem_mode)
+ && (fpsimd_mode_p || !aarch64_reg_or_zero (st_op, mem_mode)))
+ return;
+ }
+
+ // N.B. we only want to segregate FP/SIMD accesses from integer accesses
+ // before RA.
+ const bool fpsimd_bit_p = !reload_completed && fpsimd_mode_p;
+ const lfs_fields lfs = { load_p, fpsimd_bit_p, mem_size };
+
+ if (track_via_mem_expr (insn, mem, lfs))
+ return;
+
+ rtx addr = XEXP (mem, 0);
+
+ // TODO: handle auto-inc addressing modes. We probably want to
+ // record these separately (from the main hash table) and find pair
+ // opportunities by looking for other load/store instructions that are
+ // related by dataflow on the base register.
+ poly_int64 poly_off;
+ rtx base = strip_offset (addr, &poly_off);
+ if (!REG_P (base))
+ return;
+
+ // Punt on accesses relative to the eliminable regs: since we don't
+ // know the elimination offset pre-RA, we should postpone forming
+ // pairs on such accesses until after RA.
+ if (!reload_completed
+ && (REGNO (base) == FRAME_POINTER_REGNUM
+ || REGNO (base) == ARG_POINTER_REGNUM))
+ return;
+
+ // No ldp instructions with variable-length addressing.
+ if (!poly_off.is_constant ())
+ return;
+
+ HOST_WIDE_INT offset = poly_off.to_constant ();
+
+ // ldp/stp only handle offsets that are a multiple of the access size.
+ if (offset % mem_size != 0)
+ return;
+
+ // Normalize the offset.
+ offset /= mem_size;
+
+ // Check if the offset is in range.
+ if (offset > LDP_MAX_IMM || offset < LDP_MIN_IMM)
+ return;
+
+ // Now need to find def of base register.
+ def_info *base_def;
+ use_info *base_use = nullptr;
+ for (auto use_info : insn->uses ())
+ if (use_info->is_reg () && use_info->regno () == REGNO (base))
+ {
+ base_use = use_info;
+ break;
+ }
+
+ gcc_assert (base_use);
+ base_def = base_use->def ();
+ if (!base_def)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "base register (regno %d) of insn %d is undefined",
+ REGNO (base), insn->uid ());
+ return;
+ }
+
+ const auto key = std::make_pair (base_def, encode_lfs (lfs));
+ access_group &group = def_map.get_or_insert (key, NULL);
+ auto alloc = [&](access_record *access) { return node_alloc (access); };
+ group.track (alloc, poly_off, insn);
+
+ if (dump_file)
+ fprintf (dump_file,
+ "[bb %u] tracking insn %d [L=%d, FP=%d, %smode, off=%d]\n",
+ m_bb->index (), insn->uid (), lfs.load_p, lfs.fpsimd_p,
+ mode_name[mem_mode], (int)poly_off.to_constant ());
+}
+
+// Dummy predicate that never ignores any insns.
+static bool no_ignore (insn_info *) { return false; }
+
+// Return the latest dataflow hazard before INSN.
+//
+// If IGNORE is non-NULL, this points to a sub-rtx which we should
+// ignore for dataflow purposes. This is needed when considering
+// changing the RTL base of an access discovered through a MEM_EXPR
+// base.
+//
+// N.B. we ignore any defs/uses of memory here as we deal with that
+// separately, making use of alias disambiguation.
+static insn_info *
+latest_hazard_before (insn_info *insn, rtx *ignore)
+{
+ insn_info *result = nullptr;
+ auto hazard = [insn, &result](insn_info *h)
+ {
+ gcc_checking_assert (*h < *insn);
+ if (!result || *h > *result)
+ result = h;
+ };
+
+ rtx pat = PATTERN (insn->rtl ());
+ auto ignore_use = [&](use_info *u)
+ {
+ if (u->is_mem ())
+ return true;
+
+ return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
+ };
+
+ // Find defs of uses in INSN (RaW).
+ for (auto use : insn->uses ())
+ if (!ignore_use (use) && use->def ())
+ hazard (use->def ()->insn ());
+
+ // Find previous defs (WaW) or previous uses (WaR) of defs in INSN.
+ for (auto def : insn->defs ())
+ {
+ if (def->is_mem ())
+ continue;
+
+ if (def->prev_def ())
+ {
+ hazard (def->prev_def ()->insn ()); // WaW
+
+ auto set = dyn_cast <set_info *> (def->prev_def ());
+ if (set && set->has_nondebug_insn_uses ())
+ for (auto use : set->reverse_nondebug_insn_uses ())
+ if (use->insn () != insn)
+ {
+ hazard (use->insn ()); // WaR
+ break;
+ }
+ }
+
+ if (!HARD_REGISTER_NUM_P (def->regno ()))
+ continue;
+
+ // Also need to check backwards for call clobbers (WaW).
+ for (auto call_group : def->ebb ()->call_clobbers ())
+ {
+ if (!call_group->clobbers (def->resource ()))
+ continue;
+
+ auto clobber_insn = prev_call_clobbers_ignoring (*call_group,
+ def->insn (),
+ no_ignore);
+ if (clobber_insn)
+ hazard (clobber_insn);
+ }
+
+ }
+
+ return result;
+}
+
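+// As latest_hazard_before, but in the other direction: return the
+// earliest dataflow hazard after INSN, again ignoring any sub-rtx
+// pointed to by IGNORE and any defs/uses of memory.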
+static insn_info *
+first_hazard_after (insn_info *insn, rtx *ignore)
+{
+ insn_info *result = nullptr;
+ auto hazard = [insn, &result](insn_info *h)
+ {
+ gcc_checking_assert (*h > *insn);
+ if (!result || *h < *result)
+ result = h;
+ };
+
+ rtx pat = PATTERN (insn->rtl ());
+ auto ignore_use = [&](use_info *u)
+ {
+ if (u->is_mem ())
+ return true;
+
+ return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
+ };
+
+ for (auto def : insn->defs ())
+ {
+ if (def->is_mem ())
+ continue;
+
+ if (def->next_def ())
+ hazard (def->next_def ()->insn ()); // WaW
+
+ auto set = dyn_cast <set_info *> (def);
+ if (set && set->has_nondebug_insn_uses ())
+ hazard (set->first_nondebug_insn_use ()->insn ()); // RaW
+
+ if (!HARD_REGISTER_NUM_P (def->regno ()))
+ continue;
+
+ // Also check for call clobbers of this def (WaW).
+ for (auto call_group : def->ebb ()->call_clobbers ())
+ {
+ if (!call_group->clobbers (def->resource ()))
+ continue;
+
+ auto clobber_insn = next_call_clobbers_ignoring (*call_group,
+ def->insn (),
+ no_ignore);
+ if (clobber_insn)
+ hazard (clobber_insn);
+ }
+ }
+
+ // Find any subsequent defs of uses in INSN (WaR).
+ for (auto use : insn->uses ())
+ {
+ if (ignore_use (use))
+ continue;
+
+ if (use->def ())
+ {
+ auto def = use->def ()->next_def ();
+ if (def && def->insn () == insn)
+ def = def->next_def ();
+
+ if (def)
+ hazard (def->insn ());
+ }
+
+ if (!HARD_REGISTER_NUM_P (use->regno ()))
+ continue;
+
+ // Also need to handle call clobbers of our uses (again WaR).
+ //
+ // See restrict_movement_for_uses_ignoring for why we don't
+ // need to check backwards for call clobbers.
+ for (auto call_group : use->ebb ()->call_clobbers ())
+ {
+ if (!call_group->clobbers (use->resource ()))
+ continue;
+
+ auto clobber_insn = next_call_clobbers_ignoring (*call_group,
+ use->insn (),
+ no_ignore);
+ if (clobber_insn)
+ hazard (clobber_insn);
+ }
+ }
+
+ return result;
+}
+
+
+enum change_strategy {
+ CHANGE,
+ DELETE,
+ TOMBSTONE,
+};
+
+// Given a change_strategy S, convert it to a string (for output in the
+// dump file).
+static const char *cs_to_string (change_strategy s)
+{
+#define C(x) case x: return #x
+ switch (s)
+ {
+ C (CHANGE);
+ C (DELETE);
+ C (TOMBSTONE);
+ }
+#undef C
+ gcc_unreachable ();
+}
+
+// TODO: should this live in RTL-SSA?
+static bool
+ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
+{
+ // If either range is empty, then their intersection is empty.
+ if (!r1 || !r2)
+ return false;
+
+ // When do they not overlap? When one range finishes before the other
+ // starts, i.e. (*r1.last < *r2.first || *r2.last < *r1.first).
+ // Inverting this, we get the below.
+ return *r1.last >= *r2.first && *r2.last >= *r1.first;
+}
+
+// Get the range of insns that def feeds.
+static insn_range_info get_def_range (def_info *def)
+{
+ insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
+ return { def->insn (), last };
+}
+
+// Given a def (of memory), return the downwards range within which we
+// can safely move this def.
+static insn_range_info
+def_downwards_move_range (def_info *def)
+{
+ auto range = get_def_range (def);
+
+ auto set = dyn_cast <set_info *> (def);
+ if (!set || !set->has_any_uses ())
+ return range;
+
+ auto use = set->first_nondebug_insn_use ();
+ if (use)
+ range = move_earlier_than (range, use->insn ());
+
+ return range;
+}
+
+// Given a def (of memory), return the upwards range within which we can
+// safely move this def.
+static insn_range_info
+def_upwards_move_range (def_info *def)
+{
+ def_info *prev = def->prev_def ();
+ insn_range_info range { prev->insn (), def->insn () };
+
+ auto set = dyn_cast <set_info *> (prev);
+ if (!set || !set->has_any_uses ())
+ return range;
+
+ auto use = set->last_nondebug_insn_use ();
+ if (use)
+ range = move_later_than (range, use->insn ());
+
+ return range;
+}
+
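+// Given candidate store insns FIRST and SECOND and the permissible move
+// range MOVE_RANGE for the stp, decide what to do with each store
+// (recording the decision in STRATEGY): turn it into the stp, delete it,
+// or replace it with a tombstone.  Return the def of memory that the stp
+// should re-use, or null if we need to create a new insn for the pair.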
+static def_info *
+decide_stp_strategy (change_strategy strategy[2],
+ insn_info *first,
+ insn_info *second,
+ const insn_range_info &move_range)
+{
+ strategy[0] = CHANGE;
+ strategy[1] = DELETE;
+
+ unsigned viable = 0;
+ viable |= move_range.includes (first);
+ viable |= ((unsigned) move_range.includes (second)) << 1;
+
+ def_info * const defs[2] = {
+ memory_access (first->defs ()),
+ memory_access (second->defs ())
+ };
+ if (defs[0] == defs[1])
+ viable = 3; // No intervening store, either is viable.
+
+ if (!(viable & 1)
+ && ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
+ viable |= 1;
+ if (!(viable & 2)
+ && ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
+ viable |= 2;
+
+ if (viable == 2)
+ std::swap (strategy[0], strategy[1]);
+ else if (!viable)
+ // Tricky case: need to delete both accesses.
+ strategy[0] = DELETE;
+
+ for (int i = 0; i < 2; i++)
+ {
+ if (strategy[i] != DELETE)
+ continue;
+
+ // See if we can get away without a tombstone.
+ auto set = dyn_cast <set_info *> (defs[i]);
+ if (!set || !set->has_any_uses ())
+ continue; // We can indeed.
+
+ // If both sides are viable for re-purposing, and the other store's
+ // def doesn't have any uses, then we can delete the other store
+ // and re-purpose this store instead.
+ if (viable == 3)
+ {
+ gcc_assert (strategy[!i] == CHANGE);
+ auto other_set = dyn_cast <set_info *> (defs[!i]);
+ if (!other_set || !other_set->has_any_uses ())
+ {
+ strategy[i] = CHANGE;
+ strategy[!i] = DELETE;
+ break;
+ }
+ }
+
+ // Alas, we need a tombstone after all.
+ strategy[i] = TOMBSTONE;
+ }
+
+ for (int i = 0; i < 2; i++)
+ if (strategy[i] == CHANGE)
+ return defs[i];
+
+ return nullptr;
+}
+
+static GTY(()) rtx tombstone = NULL_RTX;
+
+// Generate the RTL pattern for a "tombstone"; used temporarily
+// during this pass to replace stores that are marked for deletion
+// where we can't immediately delete the store (e.g. if there are uses
+// hanging off its def of memory).
+//
+// These are deleted at the end of the pass and uses re-parented
+// appropriately at this point.
+static rtx
+gen_tombstone (void)
+{
+ if (!tombstone)
+ {
+ tombstone = gen_rtx_CLOBBER (VOIDmode,
+ gen_rtx_MEM (BLKmode,
+ gen_rtx_SCRATCH (Pmode)));
+ return tombstone;
+ }
+
+ return copy_rtx (tombstone);
+}
+
+static bool
+tombstone_insn_p (insn_info *insn)
+{
+ rtx x = tombstone ? tombstone : gen_tombstone ();
+ return rtx_equal_p (PATTERN (insn->rtl ()), x);
+}
+
+// Given a mode MODE, return a mode of the same size which we will
+// attempt to use as a canonical load/store mode.
+static machine_mode
+canonical_mode_for_mode (machine_mode mode)
+{
+ switch (GET_MODE_SIZE (mode).to_constant ())
+ {
+ case 4:
+ return E_SImode;
+ case 8:
+ return E_DImode;
+ case 16:
+ return E_V16QImode;
+ }
+
+ gcc_unreachable ();
+}
+
+// Try to convert accesses to happen in a canonical mode where possible.
+static void
+canonicalize_access_modes (rtx pats[2], bool load_p)
+{
+ rtx regs[2];
+ rtx mems[2];
+ machine_mode modes[2];
+ machine_mode canon_mode;
+
+ auto update_pat = [&](int i, rtx mem, rtx reg)
+ {
+ if (load_p)
+ pats[i] = gen_rtx_SET (reg, mem);
+ else
+ pats[i] = gen_rtx_SET (mem, reg);
+ };
+
+ for (int i = 0; i < 2; i++)
+ {
+ mems[i] = XEXP (pats[i], load_p);
+ regs[i] = XEXP (pats[i], !load_p);
+ modes[i] = GET_MODE (mems[i]);
+ }
+
+ canon_mode = canonical_mode_for_mode (modes[0]);
+ gcc_checking_assert (canon_mode == canonical_mode_for_mode (modes[1]));
+
+ if (modes[0] == canon_mode && modes[1] == canon_mode)
+ return;
+
+ // See if we can use a punning subreg to convert the register's
+ // mode, try this up front as it may not be possible.
+ rtx punned_regs[2] = {};
+ rtx adjusted_mems[2] = {};
+ for (int i = 0; i < 2; i++)
+ {
+ punned_regs[i] = lowpart_subreg (canon_mode, regs[i], modes[i]);
+ if (!punned_regs[i])
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ " failed to canonicalize reg op mode: %s -> %s\n",
+ mode_name[modes[i]], mode_name[canon_mode]);
+ continue;
+ }
+
+ adjusted_mems[i] = try_adjust_address (mems[i], canon_mode, 0);
+ if (!adjusted_mems[i])
+ {
+ if (dump_file)
+ fprintf (dump_file, " failed to canonicalize mem mode: %s -> %s\n",
+ mode_name[modes[i]], mode_name[canon_mode]);
+ }
+ }
+
+ if (adjusted_mems[0] && adjusted_mems[1])
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ " successfully canonicalized from (%s,%s) -> %smode\n",
+ mode_name[modes[0]],
+ mode_name[modes[1]],
+ mode_name[canon_mode]);
+
+ for (int i = 0; i < 2; i++)
+ update_pat (i, adjusted_mems[i], punned_regs[i]);
+ return;
+ }
+
+ // If we failed to switch to a canonical mode, at least
+ // try and make sure the modes are the same.
+ if (modes[0] == modes[1])
+ return;
+
+ for (int i = 0; i < 2; i++)
+ {
+ rtx punned_reg = lowpart_subreg (modes[!i], regs[i], modes[i]);
+ if (!punned_reg)
+ continue;
+
+ rtx adjusted_mem = try_adjust_address (mems[i], modes[!i], 0);
+ if (!adjusted_mem)
+ continue;
+
+ if (dump_file)
+ fprintf (dump_file,
+ " failed to canonicalize, but made modes agree (%s -> %s)\n",
+ mode_name[modes[i]], mode_name[modes[!i]]);
+ update_pat (i, adjusted_mem, punned_reg);
+ return;
+ }
+
+ // Worst case, recog will bail out if the modes are still
+ // incompatible.
+}
+
+// If we have vector x scalar modes, pun into the scalar mode.
+// Otherwise, leave the modes unchanged.
+static bool
+unify_access_modes (rtx pats[2], bool load_p)
+{
+ machine_mode modes[2];
+ for (int i = 0; i < 2; i++)
+ modes[i] = GET_MODE (XEXP (pats[i], load_p));
+
+ if (VECTOR_MODE_P (modes[0]) == VECTOR_MODE_P (modes[1]))
+ return true;
+
+ const int vector_i = VECTOR_MODE_P (modes[1]);
+ const int scalar_i = !vector_i;
+
+ rtx vector_mem = XEXP (pats[vector_i], load_p);
+ rtx vector_reg = XEXP (pats[vector_i], !load_p);
+
+ if (!try_adjust_address (vector_mem, modes[scalar_i], 0))
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "failed to unify %smode -> %smode, fall back "
+ "to canonicalize modes\n",
+ mode_name[modes[vector_i]], mode_name[modes[scalar_i]]);
+ canonicalize_access_modes (pats, load_p);
+ return true;
+ }
+
+ rtx adjusted_mem = adjust_address (vector_mem, modes[scalar_i], 0);
+ rtx punned_reg = lowpart_subreg (modes[scalar_i],
+ vector_reg,
+ modes[vector_i]);
+ if (!punned_reg)
+ {
+ if (dump_file)
+ fprintf (dump_file, "Failed to unify %smode -> %smode\n",
+ mode_name[modes[vector_i]], mode_name[modes[scalar_i]]);
+ return false;
+ }
+
+ pats[vector_i] = load_p
+ ? gen_rtx_SET (punned_reg, adjusted_mem)
+ : gen_rtx_SET (adjusted_mem, punned_reg);
+ return true;
+}
+
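+// Walk the REG_NOTE list NOTE, dropping notes that can't (or needn't) be
+// preserved on the combined pair insn and prepending the rest to RESULT.
+// Set *EH_REGION if we see a REG_EH_REGION note.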
+static rtx
+filter_notes (rtx note, rtx result, bool *eh_region)
+{
+ for (; note; note = XEXP (note, 1))
+ {
+ switch (REG_NOTE_KIND (note))
+ {
+ case REG_EQUAL:
+ case REG_EQUIV:
+ case REG_DEAD:
+ case REG_UNUSED:
+ case REG_NOALIAS:
+ // These can all be dropped.  REG_EQU{AL,IV} cannot apply to
+ // non-single_set insns, and REG_{DEAD,UNUSED} are re-computed
+ // by RTL-SSA, see
+ // rtl-ssa/changes.cc:update_notes.
+ //
+ // Similarly, REG_NOALIAS cannot apply to a parallel.
+ break;
+ case REG_EH_REGION:
+ gcc_assert (!*eh_region);
+ *eh_region = true;
+ result = alloc_reg_note (REG_EH_REGION, XEXP (note, 0), result);
+ break;
+ case REG_CFA_OFFSET:
+ case REG_CFA_RESTORE:
+ result = alloc_reg_note (REG_NOTE_KIND (note),
+ copy_rtx (XEXP (note, 0)),
+ result);
+ break;
+ default:
+ // Unexpected REG_NOTE kind.
+ gcc_unreachable ();
+ }
+ }
+
+ return result;
+}
+
+// Ensure we have a sensible scheme for combining REG_NOTEs
+// given two candidate insns I1 and I2.
+static rtx
+combine_reg_notes (insn_info *i1, insn_info *i2)
+{
+ bool found_eh_region = false;
+ rtx result = NULL_RTX;
+ result = filter_notes (REG_NOTES (i1->rtl ()), result, &found_eh_region);
+ return filter_notes (REG_NOTES (i2->rtl ()), result, &found_eh_region);
+}
+
+// Try and actually fuse the pair given by insns I1 and I2.
+static bool
+fuse_pair (bool load_p,
+ insn_info *i1,
+ insn_info *i2,
+ base_cand &base,
+ const insn_range_info &move_range,
+ bool &emitted_tombstone_p)
+{
+ auto attempt = crtl->ssa->new_change_attempt ();
+
+ auto make_change = [&attempt](insn_info *insn)
+ {
+ return crtl->ssa->change_alloc <insn_change> (attempt, insn);
+ };
+ auto make_delete = [&attempt](insn_info *insn)
+ {
+ return crtl->ssa->change_alloc <insn_change> (attempt,
+ insn,
+ insn_change::DELETE);
+ };
+
+ // Are we using a tombstone insn for this pair?
+ bool have_tombstone_p = false;
+
+ insn_info *first = (*i1 < *i2) ? i1 : i2;
+ insn_info *second = (first == i1) ? i2 : i1;
+
+ insn_info *insns[2] = { first, second };
+
+ auto_vec <insn_change *> changes;
+ changes.reserve (3);
+
+ rtx pats[2] = {
+ PATTERN (i1->rtl ()),
+ PATTERN (i2->rtl ())
+ };
+
+ use_array input_uses[2] = { first->uses (), second->uses () };
+
+ if (base.from_insn != -1)
+ {
+ // If we're not already using a shared base, we need
+ // to re-write one of the accesses to use the base from
+ // the other insn.
+ gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
+
+ const bool lower_base_p = (insns[base.from_insn] == i1);
+
+ rtx base_pat = PATTERN (insns[base.from_insn]->rtl ());
+ rtx change_pat = PATTERN (insns[!base.from_insn]->rtl ());
+ rtx base_mem = XEXP (base_pat, load_p);
+ rtx change_mem = XEXP (change_pat, load_p);
+
+ machine_mode mem_mode = GET_MODE (base_mem);
+ HOST_WIDE_INT adjust_amt = GET_MODE_SIZE (mem_mode).to_constant ();
+ if (!lower_base_p)
+ adjust_amt *= -1;
+
+ rtx change_reg = XEXP (change_pat, !load_p);
+ machine_mode mode_for_mem = GET_MODE (change_mem);
+ if (!try_adjust_address (base_mem, mode_for_mem, adjust_amt))
+ {
+ // We need to canonicalize the mode to make the adjustment.
+ // This should be guaranteed to work as we checked this in
+ // get_viable_bases.
+ mode_for_mem = canonical_mode_for_mode (mode_for_mem);
+ change_reg = lowpart_subreg (mode_for_mem,
+ change_reg,
+ GET_MODE (change_mem));
+ gcc_assert (change_reg);
+ }
+
+ rtx new_mem = adjust_address (base_mem, mode_for_mem, adjust_amt);
+ rtx new_set = load_p
+ ? gen_rtx_SET (change_reg, new_mem)
+ : gen_rtx_SET (new_mem, change_reg);
+
+ pats[lower_base_p] = new_set;
+
+ hash_set <use_info *> uses_to_drop;
+ use_array &uses_to_change = input_uses[!base.from_insn];
+
+ for (auto use : uses_to_change)
+ if (use->is_reg ()
+ && !refers_to_regno_p (use->regno (),
+ use->regno () + 1,
+ change_pat,
+ &XEXP (change_pat, load_p))
+ && uses_to_drop.add (use))
+ gcc_unreachable ();
+
+ if (!uses_to_drop.is_empty ())
+ {
+ access_array_builder builder (attempt);
+ gcc_checking_assert (uses_to_drop.elements ()
+ <= uses_to_change.size ());
+ builder.reserve (uses_to_change.size () - uses_to_drop.elements ());
+ auto it = uses_to_change.begin ();
+ auto end = uses_to_change.end ();
+ for (; it != end; ++it)
+ if (!uses_to_drop.contains (*it))
+ builder.quick_push (*it);
+ uses_to_change = use_array (builder.finish ());
+ }
+ }
+
+ if (aarch64_ldp_canonicalize_modes)
+ canonicalize_access_modes (pats, load_p);
+ else if (!unify_access_modes (pats, load_p))
+ return false;
+
+ // Go through and drop uses that only occur in register notes,
+ // as we won't be preserving those.
+ for (int i = 0; i < 2; i++)
+ {
+ auto rti = insns[i]->rtl ();
+ if (!REG_NOTES (rti))
+ continue;
+
+ input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
+ }
+
+ // Now that we know what base mem we're going to use, check if it's OK
+ // with the ldp/stp policy.
+ rtx first_mem = XEXP (pats[0], load_p);
+ if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
+ load_p,
+ GET_MODE (first_mem)))
+ {
+ if (dump_file)
+ fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
+ i1->uid (), i2->uid ());
+ return false;
+ }
+
+ rtx reg_notes = combine_reg_notes (i1, i2);
+
+ rtx pair_pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, pats[0], pats[1]));
+
+ insn_change *pair_change = nullptr;
+ auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
+ rtx_insn *rti = change->insn ()->rtl ();
+ gcc_assert (validate_unshare_change (rti, &PATTERN (rti), pair_pat,
+ true));
+ gcc_assert (validate_change (rti, &REG_NOTES (rti),
+ reg_notes, true));
+ };
+
+ if (load_p)
+ {
+ changes.quick_push (make_delete (first));
+ pair_change = make_change (second);
+ changes.quick_push (pair_change);
+
+ pair_change->move_range = move_range;
+ pair_change->new_defs = merge_access_arrays (attempt,
+ first->defs (),
+ second->defs ());
+ gcc_assert (pair_change->new_defs.is_valid ());
+
+ pair_change->new_uses
+ = merge_access_arrays (attempt,
+ drop_memory_access (input_uses[0]),
+ drop_memory_access (input_uses[1]));
+ gcc_assert (pair_change->new_uses.is_valid ());
+ set_pair_pat (pair_change);
+ }
+ else
+ {
+ change_strategy strategy[2];
+ def_info *stp_def = decide_stp_strategy (strategy, first, second,
+ move_range);
+ if (dump_file)
+ {
+ auto cs1 = cs_to_string (strategy[0]);
+ auto cs2 = cs_to_string (strategy[1]);
+ fprintf (dump_file,
+ " stp strategy for candidate insns (%d,%d): (%s,%s)\n",
+ insns[0]->uid (), insns[1]->uid (), cs1, cs2);
+ if (stp_def)
+ fprintf (dump_file,
+ " re-using mem def from insn %d\n",
+ stp_def->insn ()->uid ());
+ }
+
+ insn_change *change;
+ for (int i = 0; i < 2; i++)
+ {
+ switch (strategy[i])
+ {
+ case DELETE:
+ changes.quick_push (make_delete (insns[i]));
+ break;
+ case TOMBSTONE:
+ case CHANGE:
+ change = make_change (insns[i]);
+ if (strategy[i] == CHANGE)
+ {
+ set_pair_pat (change);
+ change->new_uses = merge_access_arrays (attempt,
+ input_uses[0],
+ input_uses[1]);
+ auto d1 = drop_memory_access (first->defs ());
+ auto d2 = drop_memory_access (second->defs ());
+ change->new_defs = merge_access_arrays (attempt, d1, d2);
+ gcc_assert (stp_def);
+ change->new_defs = insert_access (attempt,
+ stp_def,
+ change->new_defs);
+ change->move_range = move_range;
+ pair_change = change;
+ }
+ else
+ {
+ rtx_insn *rti = insns[i]->rtl ();
+ gcc_assert (validate_change (rti, &PATTERN (rti),
+ gen_tombstone (), true));
+ gcc_assert (validate_change (rti, &REG_NOTES (rti),
+ NULL_RTX, true));
+ change->new_uses = use_array (nullptr, 0);
+ have_tombstone_p = true;
+ }
+ gcc_assert (change->new_defs.is_valid ());
+ gcc_assert (change->new_uses.is_valid ());
+ changes.quick_push (change);
+ break;
+ }
+ }
+
+ if (!stp_def)
+ {
+ // Tricky case. Cannot re-purpose existing insns for stp.
+ // Need to insert new insn.
+ if (dump_file)
+ fprintf (dump_file,
+ " stp fusion: cannot re-purpose candidate stores\n");
+
+ auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
+ change = make_change (new_insn);
+ change->move_range = move_range;
+ change->new_uses = merge_access_arrays (attempt,
+ input_uses[0],
+ input_uses[1]);
+ gcc_assert (change->new_uses.is_valid ());
+
+ auto d1 = drop_memory_access (first->defs ());
+ auto d2 = drop_memory_access (second->defs ());
+ change->new_defs = merge_access_arrays (attempt, d1, d2);
+ gcc_assert (change->new_defs.is_valid ());
+
+ auto new_set = crtl->ssa->create_set (attempt, new_insn, memory);
+ change->new_defs = insert_access (attempt, new_set,
+ change->new_defs);
+ gcc_assert (change->new_defs.is_valid ());
+ changes.safe_insert (1, change);
+ pair_change = change;
+ }
+ }
+
+ auto n_changes = changes.length ();
+ gcc_checking_assert (n_changes == 2 || n_changes == 3);
+
+ auto is_changing = insn_is_changing (changes);
+ for (unsigned i = 0; i < n_changes; i++)
+ gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
+
+ // Check the pair pattern is recog'd.
+ if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
+ {
+ if (dump_file)
+ fprintf (dump_file, " failed to form pair, recog failed\n");
+
+ // Free any reg notes we allocated.
+ while (reg_notes)
+ {
+ rtx next = XEXP (reg_notes, 1);
+ free_EXPR_LIST_node (reg_notes);
+ reg_notes = next;
+ }
+ cancel_changes (0);
+ return false;
+ }
+
+ gcc_assert (crtl->ssa->verify_insn_changes (changes));
+
+ confirm_change_group ();
+ crtl->ssa->change_insns (changes);
+ emitted_tombstone_p |= have_tombstone_p;
+ return true;
+}
+
+// Return true if STORE_INSN may modify mem rtx MEM. Make sure we keep
+// within our BUDGET for alias analysis.
+static bool
+store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
+{
+ if (tombstone_insn_p (store_insn))
+ return false;
+
+ if (!budget)
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "exceeded budget, assuming store %d aliases with mem ",
+ store_insn->uid ());
+ print_simple_rtl (dump_file, mem);
+ fprintf (dump_file, "\n");
+ }
+
+ return true;
+ }
+
+ budget--;
+ return memory_modified_in_insn_p (mem, store_insn->rtl ());
+}
+
+// Return true if LOAD may be modified by STORE. Make sure we keep
+// within our BUDGET for alias analysis.
+static bool
+load_modified_by_store_p (insn_info *load,
+ insn_info *store,
+ int &budget)
+{
+ gcc_checking_assert (budget >= 0);
+
+ if (!budget)
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "exceeded budget, assuming load %d aliases with store %d\n",
+ load->uid (), store->uid ());
+ }
+ return true;
+ }
+
+ // It isn't safe to re-order stores over calls.
+ if (CALL_P (load->rtl ()))
+ return true;
+
+ budget--;
+ return modified_in_p (PATTERN (load->rtl ()), store->rtl ());
+}
+
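+// Abstract interface for walking over insns that might alias with one of
+// the candidate accesses; implemented by store_walker and load_walker
+// below and driven by do_alias_analysis.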
+struct alias_walker
+{
+ virtual insn_info *insn () const = 0;
+ virtual bool valid () const = 0;
+ virtual bool conflict_p (int &budget) const = 0;
+ virtual void advance () = 0;
+};
+
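+// Walk over the stores (defs of memory) between a candidate access and
+// LIMIT, in the direction given by REVERSE, checking each one for a
+// conflict with CAND_MEM.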
+template<bool reverse>
+class store_walker : public alias_walker
+{
+ using def_iter_t = typename std::conditional <reverse,
+ reverse_def_iterator, def_iterator>::type;
+
+ def_iter_t def_iter;
+ rtx cand_mem;
+ insn_info *limit;
+
+public:
+ store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn) :
+ def_iter (mem_def), cand_mem (mem), limit (limit_insn) {}
+
+ bool valid () const override
+ {
+ if (!*def_iter)
+ return false;
+
+ if (reverse)
+ return *((*def_iter)->insn ()) > *limit;
+ else
+ return *((*def_iter)->insn ()) < *limit;
+ }
+ insn_info *insn () const override { return (*def_iter)->insn (); }
+ void advance () override { def_iter++; }
+ bool conflict_p (int &budget) const override
+ {
+ return store_modifies_mem_p (cand_mem, insn (), budget);
+ }
+};
+
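+// Walk over the loads (nondebug uses of memory defs) between a candidate
+// store and LIMIT, in the direction given by REVERSE, checking each one
+// for a conflict with CAND_STORE.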
+template<bool reverse>
+class load_walker : public alias_walker
+{
+ using def_iter_t = typename std::conditional <reverse,
+ reverse_def_iterator, def_iterator>::type;
+ using use_iter_t = typename std::conditional <reverse,
+ reverse_use_iterator, nondebug_insn_use_iterator>::type;
+
+ def_iter_t def_iter;
+ use_iter_t use_iter;
+ insn_info *cand_store;
+ insn_info *limit;
+
+ static use_info *start_use_chain (def_iter_t &def_iter)
+ {
+ set_info *set = nullptr;
+ for (; *def_iter; def_iter++)
+ {
+ set = dyn_cast <set_info *> (*def_iter);
+ if (!set)
+ continue;
+
+ use_info *use = reverse
+ ? set->last_nondebug_insn_use ()
+ : set->first_nondebug_insn_use ();
+
+ if (use)
+ return use;
+ }
+
+ return nullptr;
+ }
+
+public:
+ void advance () override
+ {
+ use_iter++;
+ if (*use_iter)
+ return;
+ def_iter++;
+ use_iter = start_use_chain (def_iter);
+ }
+
+ insn_info *insn () const override
+ {
+ gcc_checking_assert (*use_iter);
+ return (*use_iter)->insn ();
+ }
+
+ bool valid () const override
+ {
+ if (!*use_iter)
+ return false;
+
+ if (reverse)
+ return *((*use_iter)->insn ()) > *limit;
+ else
+ return *((*use_iter)->insn ()) < *limit;
+ }
+
+ bool conflict_p (int &budget) const override
+ {
+ return load_modified_by_store_p (insn (), cand_store, budget);
+ }
+
+ load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
+ : def_iter (def), use_iter (start_use_chain (def_iter)),
+ cand_store (store), limit (limit_insn) {}
+};
+
+// Process our alias_walkers in a round-robin fashion, proceeding until
+// nothing more can be learned from alias analysis.
+//
+// We try to maintain the invariant that if a walker becomes invalid, we
+// set its pointer to null.
+static void
+do_alias_analysis (insn_info *alias_hazards[4],
+ alias_walker *walkers[4],
+ bool load_p)
+{
+ const int n_walkers = 2 + (2 * !load_p);
+ int budget = aarch64_ldp_alias_check_limit;
+
+ auto next_walker = [walkers,n_walkers](int current) -> int {
+ for (int j = 1; j <= n_walkers; j++)
+ {
+ int idx = (current + j) % n_walkers;
+ if (walkers[idx])
+ return idx;
+ }
+ return -1;
+ };
+
+ int i = -1;
+ for (int j = 0; j < n_walkers; j++)
+ {
+ alias_hazards[j] = nullptr;
+ if (!walkers[j])
+ continue;
+
+ if (!walkers[j]->valid ())
+ walkers[j] = nullptr;
+ else if (i == -1)
+ i = j;
+ }
+
+ while (i >= 0)
+ {
+ int insn_i = i % 2;
+ int paired_i = (i & 2) + !insn_i;
+ int pair_fst = (i & 2);
+ int pair_snd = (i & 2) + 1;
+
+ if (walkers[i]->conflict_p (budget))
+ {
+ alias_hazards[i] = walkers[i]->insn ();
+
+ // We got an aliasing conflict for this {load,store} walker,
+ // so we don't need to walk any further.
+ walkers[i] = nullptr;
+
+ // If we have a pair of alias conflicts that prevent
+ // forming the pair, stop. There's no need to do further
+ // analysis.
+ if (alias_hazards[paired_i]
+ && (*alias_hazards[pair_fst] <= *alias_hazards[pair_snd]))
+ return;
+
+ if (!load_p)
+ {
+ int other_pair_fst = (pair_fst ? 0 : 2);
+ int other_paired_i = other_pair_fst + !insn_i;
+
+ int x_pair_fst = (i == pair_fst) ? i : other_paired_i;
+ int x_pair_snd = (i == pair_fst) ? other_paired_i : i;
+
+ // Similarly, handle the case where we have a {load,store}
+ // or {store,load} alias hazard pair that prevents forming
+ // the pair.
+ if (alias_hazards[other_paired_i]
+ && *alias_hazards[x_pair_fst] <= *alias_hazards[x_pair_snd])
+ return;
+ }
+ }
+
+ if (walkers[i])
+ {
+ walkers[i]->advance ();
+
+ if (!walkers[i]->valid ())
+ walkers[i] = nullptr;
+ }
+
+ i = next_walker (i);
+ }
+}
+
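+// For a pair of insns INSNS discovered via a common MEM_EXPR base, work
+// out which base register defs (if any) are viable for forming the pair,
+// pushing candidates onto BASE_CANDS.  CAND_MEMS and REG_OPS give the mem
+// and register operands of the two insns, ACCESS_SIZE the access size in
+// bytes, and REVERSED says whether the lower-offset access comes later in
+// program order.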
+static void
+get_viable_bases (insn_info *insns[2],
+ vec <base_cand> &base_cands,
+ rtx cand_mems[2],
+ rtx reg_ops[2],
+ unsigned access_size,
+ bool reversed)
+{
+ // We discovered this pair through a common MEM_EXPR base.
+ // Need to ensure that we have a common base register def
+ // that is live at both locations.
+ def_info *base_defs[2] = {};
+ for (int i = 0; i < 2; i++)
+ {
+ const bool is_lower = (i == reversed);
+ poly_int64 poly_off;
+ rtx addr = strip_offset (XEXP (cand_mems[i], 0), &poly_off);
+
+ if (!REG_P (addr) || !poly_off.is_constant ())
+ continue;
+
+ // Punt on accesses relative to eliminable regs. Since we don't know the
+ // elimination offset pre-RA, we should postpone forming pairs on such
+ // accesses until after RA.
+ if (!reload_completed
+ && (REGNO (addr) == FRAME_POINTER_REGNUM
+ || REGNO (addr) == ARG_POINTER_REGNUM))
+ continue;
+
+ HOST_WIDE_INT base_off = poly_off.to_constant ();
+
+ // It should be unlikely that we ever punt here, since MEM_EXPR offset
+ // alignment should be a good proxy for register offset alignment.
+ if (base_off % access_size != 0)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "MEM_EXPR offset aligned, reg offset unaligned "
+ "(insn %d)\n",
+ insns[i]->uid ());
+ continue;
+ }
+
+ base_off /= access_size;
+
+ if (!is_lower)
+ base_off--;
+
+ if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
+ continue;
+
+ for (auto use : insns[i]->uses ())
+ if (use->is_reg () && use->regno () == REGNO (addr))
+ {
+ base_defs[i] = use->def ();
+ break;
+ }
+ }
+
+ if (!base_defs[0] && !base_defs[1])
+ {
+ if (dump_file)
+ fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
+ insns[0]->uid (), insns[1]->uid ());
+ return;
+ }
+
+ if (base_defs[0] == base_defs[1])
+ {
+ // Easy case: insns already share the same base reg def.
+ base_cands.quick_push (base_defs[0]);
+ return;
+ }
+
+ if (base_defs[0] && base_defs[1]
+ && base_defs[0]->regno () == base_defs[1]->regno ())
+ {
+ // Accesses see different versions of the same
+ // base register, i.e. the base register gets updated
+ // between the two accesses.
+ //
+ // For now, we punt on this, but TODO: we should try
+ // to use an auto-inc ldp/stp where possible here.
+ if (dump_file)
+ fprintf (dump_file, "punting on base register update (%d,%d)\n",
+ insns[0]->uid (), insns[1]->uid ());
+ return;
+ }
+
+ for (int i = 0; i < 2; i++)
+ {
+ if (!base_defs[i])
+ continue;
+
+ // We already know that the offset is such that a valid pair insn
+ // can be formed, but given that we're changing a base, we need to
+ // check that we can safely adjust the mem to get a suitable
+ // paired mem while still satisfying "m".
+ const bool is_lower = (i == reversed);
+ rtx mem = cand_mems[i];
+ rtx other_mem = cand_mems[!i];
+ HOST_WIDE_INT adjust_amt = access_size;
+ if (!is_lower)
+ adjust_amt *= -1;
+ machine_mode this_mode = GET_MODE (mem);
+ machine_mode new_mode = GET_MODE (other_mem);
+ if (!try_adjust_address (mem, new_mode, adjust_amt))
+ {
+ auto canon_mode = canonical_mode_for_mode (new_mode);
+ if (canon_mode == new_mode)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "insn (%d): base not viable, can't adjust mem by %"
+ PRId64 " (from %smode -> %smode)\n",
+ insns[i]->uid (), adjust_amt,
+ mode_name[this_mode], mode_name[new_mode]);
+ continue;
+ }
+
+ if (!try_adjust_address (mem, canon_mode, adjust_amt))
+ {
+
+ if (dump_file)
+ fprintf (dump_file,
+ "insn (%d): base not viable, can't adjust mem by %"
+ PRId64 " (%s -> %s) or in canonical %smode\n",
+ insns[i]->uid (), adjust_amt,
+ mode_name[this_mode], mode_name[new_mode],
+ mode_name[canon_mode]);
+ continue;
+ }
+
+ rtx reg_op = lowpart_subreg (canon_mode, reg_ops[!i], new_mode);
+ if (!reg_op)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "insn (%d): base not viable, can only adjust mem "
+ "in %smode, but reg op can't be punned from %smode\n",
+ insns[i]->uid (),
+ mode_name[canon_mode], mode_name[new_mode]);
+ continue;
+ }
+
+ if (dump_file)
+ fprintf (dump_file,
+ "insn (%d): can't adjust mem by %" PRId64
+ " (%s -> %s) but can in (canonical) %smode\n",
+ insns[i]->uid (), adjust_amt,
+ mode_name[this_mode], mode_name[new_mode],
+ mode_name[canon_mode]);
+ }
+
+ base_cands.quick_push (base_cand {base_defs[i], i});
+ }
+}
+
+// Given two adjacent memory accesses of the same size, I1 and I2, try
+// and see if we can merge them into a ldp or stp.
+static bool
+try_fuse_pair (bool load_p,
+ unsigned access_size,
+ insn_info *i1,
+ insn_info *i2,
+ base_info binfo,
+ bool &emitted_tombstone_p)
+{
+ if (dump_file)
+ fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
+ load_p, i1->uid (), i2->uid ());
+
+ insn_info *insns[2];
+ bool reversed = false;
+ if (*i1 < *i2)
+ {
+ insns[0] = i1;
+ insns[1] = i2;
+ }
+ else
+ {
+ insns[0] = i2;
+ insns[1] = i1;
+ reversed = true;
+ }
+
+ rtx cand_mems[2];
+ rtx reg_ops[2];
+ for (int i = 0; i < 2; i++)
+ {
+ rtx pat = PATTERN (insns[i]->rtl ());
+ cand_mems[i] = XEXP (pat, load_p);
+ reg_ops[i] = XEXP (pat, !load_p);
+ }
+
+ if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "punting on ldp due to reg conflcits (%d,%d)\n",
+ insns[0]->uid (), insns[1]->uid ());
+ return false;
+ }
+
+ if (cfun->can_throw_non_call_exceptions
+ && insn_could_throw_p (insns[0]->rtl ())
+ && insn_could_throw_p (insns[1]->rtl ()))
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "can't combine insns with EH side effects (%d,%d)\n",
+ insns[0]->uid (), insns[1]->uid ());
+ return false;
+ }
+
+ auto_vec <base_cand> base_cands;
+ base_cands.reserve (2);
+ if (binfo.is_reg ())
+ // Simple case: both accesses using the same base register def.
+ base_cands.quick_push (binfo.get_reg ());
+ else
+ {
+ get_viable_bases (insns, base_cands, cand_mems,
+ reg_ops, access_size, reversed);
+ if (base_cands.is_empty ())
+ return false;
+ }
+
+ for (auto use : insns[1]->uses ())
+ if (!use->is_mem () && use->def () && use->def ()->insn () == insns[0])
+ {
+ if (dump_file)
+ fprintf (dump_file, "%d has true dependence on %d, rejecting pair\n",
+ insns[1]->uid (), insns[0]->uid ());
+ return false;
+ }
+
+ unsigned i = 0;
+ while (i < base_cands.length ())
+ {
+ base_cand &cand = base_cands[i];
+
+ rtx *ignore[2] = {};
+ for (int j = 0; j < 2; j++)
+ if (cand.from_insn == -1 || cand.from_insn == !j)
+ ignore[j] = &XEXP (cand_mems[j], 0);
+
+ insn_info *h = first_hazard_after (insns[0], ignore[0]);
+ if (h && *h <= *insns[1])
+ cand.hazards[0] = h;
+
+ h = latest_hazard_before (insns[1], ignore[1]);
+ if (h && *h >= *insns[0])
+ cand.hazards[1] = h;
+
+ if (!cand.viable ())
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "pair (%d,%d): rejecting base %d due to dataflow "
+ "hazards (%d,%d)\n",
+ insns[0]->uid (),
+ insns[1]->uid (),
+ cand.m_def->regno (),
+ cand.hazards[0]->uid (),
+ cand.hazards[1]->uid ());
+
+ base_cands.ordered_remove (i);
+ }
+ else
+ i++;
+ }
+
+ if (base_cands.is_empty ())
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "can't form pair (%d,%d) due to dataflow hazards\n",
+ insns[0]->uid (), insns[1]->uid ());
+ return false;
+ }
+
+ insn_info *alias_hazards[4] = {};
+
+ // First def of memory after the first insn, and last def of memory
+ // before the second insn, respectively.
+ def_info *mem_defs[2] = {};
+ if (load_p)
+ {
+ if (!MEM_READONLY_P (cand_mems[0]))
+ {
+ mem_defs[0] = memory_access (insns[0]->uses ())->def ();
+ gcc_checking_assert (mem_defs[0]);
+ mem_defs[0] = mem_defs[0]->next_def ();
+ }
+ if (!MEM_READONLY_P (cand_mems[1]))
+ {
+ mem_defs[1] = memory_access (insns[1]->uses ())->def ();
+ gcc_checking_assert (mem_defs[1]);
+ }
+ }
+ else
+ {
+ mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
+ mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
+ gcc_checking_assert (mem_defs[0]);
+ gcc_checking_assert (mem_defs[1]);
+ }
+
+ store_walker<false> forward_store_walker (mem_defs[0],
+ cand_mems[0],
+ insns[1]);
+ store_walker<true> backward_store_walker (mem_defs[1],
+ cand_mems[1],
+ insns[0]);
+ alias_walker *walkers[4] = {};
+ if (mem_defs[0])
+ walkers[0] = &forward_store_walker;
+ if (mem_defs[1])
+ walkers[1] = &backward_store_walker;
+
+ if (load_p && (mem_defs[0] || mem_defs[1]))
+ do_alias_analysis (alias_hazards, walkers, load_p);
+ else
+ {
+ // We want to find any loads hanging off the first store.
+ mem_defs[0] = memory_access (insns[0]->defs ());
+ load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
+ load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
+ walkers[2] = &forward_load_walker;
+ walkers[3] = &backward_load_walker;
+ do_alias_analysis (alias_hazards, walkers, load_p);
+ // Now consolidate hazards back down.
+ if (alias_hazards[2]
+ && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
+ alias_hazards[0] = alias_hazards[2];
+
+ if (alias_hazards[3]
+ && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
+ alias_hazards[1] = alias_hazards[3];
+ }
+
+ if (alias_hazards[0] && alias_hazards[1]
+ && *alias_hazards[0] <= *alias_hazards[1])
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
+ i1->uid (), i2->uid (),
+ alias_hazards[0]->uid (), alias_hazards[1]->uid ());
+ return false;
+ }
+
+ // Now narrow the hazards on each base candidate using
+ // the alias hazards.
+ i = 0;
+ while (i < base_cands.length ())
+ {
+ base_cand &cand = base_cands[i];
+ if (alias_hazards[0] && (!cand.hazards[0]
+ || *alias_hazards[0] < *cand.hazards[0]))
+ cand.hazards[0] = alias_hazards[0];
+ if (alias_hazards[1] && (!cand.hazards[1]
+ || *alias_hazards[1] > *cand.hazards[1]))
+ cand.hazards[1] = alias_hazards[1];
+
+ if (cand.viable ())
+ i++;
+ else
+ {
+ if (dump_file)
+ fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
+ "alias/dataflow hazards (%d,%d)",
+ insns[0]->uid (), insns[1]->uid (),
+ cand.m_def->regno (),
+ cand.hazards[0]->uid (),
+ cand.hazards[1]->uid ());
+
+ base_cands.ordered_remove (i);
+ }
+ }
+
+ if (base_cands.is_empty ())
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "cannot form pair (%d,%d) due to alias/dataflow hazards",
+ insns[0]->uid (), insns[1]->uid ());
+
+ return false;
+ }
+
+ base_cand *base = &base_cands[0];
+ if (base_cands.length () > 1)
+ {
+ // If there are still multiple viable bases, it makes sense
+ // to choose one that allows us to reduce register pressure,
+ // for loads this means moving further down, for stores this
+ // means moving further up.
+ gcc_checking_assert (base_cands.length () == 2);
+ const int hazard_i = !load_p;
+ if (base->hazards[hazard_i])
+ {
+ if (!base_cands[1].hazards[hazard_i])
+ base = &base_cands[1];
+ else if (load_p
+ && *base_cands[1].hazards[hazard_i]
+ > *(base->hazards[hazard_i]))
+ base = &base_cands[1];
+ else if (!load_p
+ && *base_cands[1].hazards[hazard_i]
+ < *(base->hazards[hazard_i]))
+ base = &base_cands[1];
+ }
+ }
+
+ // Otherwise, hazards[0] > hazards[1].
+ // Pair can be formed anywhere in (hazards[1], hazards[0]).
+ insn_range_info range (insns[0], insns[1]);
+ if (base->hazards[1])
+ range.first = base->hazards[1];
+ if (base->hazards[0])
+ range.last = base->hazards[0]->prev_nondebug_insn ();
+
+ // Placement strategy: push loads down and pull stores up, this should
+ // help register pressure by reducing live ranges.
+ if (load_p)
+ range.first = range.last;
+ else
+ range.last = range.first;
+
+ if (dump_file)
+ {
+ auto print_hazard = [](insn_info *i)
+ {
+ if (i)
+ fprintf (dump_file, "%d", i->uid ());
+ else
+ fprintf (dump_file, "-");
+ };
+ auto print_pair = [print_hazard](insn_info **i)
+ {
+ print_hazard (i[0]);
+ fprintf (dump_file, ",");
+ print_hazard (i[1]);
+ };
+
+ fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
+ load_p, insns[0]->uid (), insns[1]->uid (),
+ base->m_def->regno ());
+ print_pair (base->hazards);
+ fprintf (dump_file, "), move_range: (%d,%d)\n",
+ range.first->uid (), range.last->uid ());
+ }
+
+ return fuse_pair (load_p, i1, i2, *base, range, emitted_tombstone_p);
+}
+
+// Erase [l.begin (), i] inclusive, respecting iterator order.
+static insn_iter_t
+erase_prefix (insn_list_t &l, insn_iter_t i)
+{
+ l.erase (l.begin (), std::next (i));
+ return l.begin ();
+}
+
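+// Erase the single element at I from L, returning an adjacent iterator
+// (the following element if I is at BEGIN, otherwise the preceding one)
+// from which iteration can continue.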
+static insn_iter_t
+erase_one (insn_list_t &l, insn_iter_t i, insn_iter_t begin)
+{
+ auto prev_or_next = (i == begin) ? std::next (i) : std::prev (i);
+ l.erase (i);
+ return prev_or_next;
+}
+
+static void
+dump_insn_list (FILE *f, const insn_list_t &l)
+{
+ fprintf (f, "(");
+
+ auto i = l.begin ();
+ auto end = l.end ();
+
+ if (i != end)
+ {
+ fprintf (f, "%d", (*i)->uid ());
+ i++;
+ }
+
+ for (; i != end; i++)
+ {
+ fprintf (f, ", %d", (*i)->uid ());
+ }
+
+ fprintf (f, ")");
+}
+
+DEBUG_FUNCTION void
+debug (const insn_list_t &l)
+{
+ dump_insn_list (stderr, l);
+ fprintf (stderr, "\n");
+}
+
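+// Walk the insns in LEFT_LIST and RIGHT_LIST (which access adjacent
+// offsets) in lockstep, trying to fuse nearby (left, right) accesses via
+// try_fuse_pair.  Insns from RIGHT_LIST that get fused are recorded in
+// TO_DELETE, and fused or unfusable insns are pruned from the lists as we
+// go.  The remaining parameters are passed through to try_fuse_pair.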
+void
+merge_pairs (insn_iter_t l_begin,
+ insn_iter_t l_end,
+ insn_iter_t r_begin,
+ insn_iter_t r_end,
+ insn_list_t &left_list,
+ insn_list_t &right_list,
+ hash_set <insn_info *> &to_delete,
+ bool load_p,
+ unsigned access_size,
+ base_info binfo,
+ bool &emitted_tombstone_p)
+{
+ auto iter_l = l_begin;
+ auto iter_r = r_begin;
+
+ bool result;
+ while (l_begin != l_end && r_begin != r_end)
+ {
+ auto next_l = std::next (iter_l);
+ auto next_r = std::next (iter_r);
+ if (**iter_l < **iter_r
+ && next_l != l_end
+ && **next_l < **iter_r)
+ {
+ iter_l = next_l;
+ continue;
+ }
+ else if (**iter_r < **iter_l
+ && next_r != r_end
+ && **next_r < **iter_l)
+ {
+ iter_r = next_r;
+ continue;
+ }
+
+ bool update_l = false;
+ bool update_r = false;
+
+ result = try_fuse_pair (load_p, access_size,
+ *iter_l, *iter_r, binfo,
+ emitted_tombstone_p);
+ if (result)
+ {
+ update_l = update_r = true;
+ if (to_delete.add (*iter_r))
+ gcc_unreachable (); // Shouldn't get added twice.
+
+ iter_l = erase_one (left_list, iter_l, l_begin);
+ iter_r = erase_one (right_list, iter_r, r_begin);
+ }
+ else
+ {
+ // Here we know that the entire prefix we skipped
+ // over cannot merge with anything further on
+ // in iteration order (there are aliasing hazards
+ // on both sides), so delete the entire prefix.
+ if (**iter_l < **iter_r)
+ {
+ // Delete everything from l_begin to iter_l, inclusive.
+ update_l = true;
+ iter_l = erase_prefix (left_list, iter_l);
+ }
+ else
+ {
+ // Delete everything from r_begin to iter_r, inclusive.
+ update_r = true;
+ iter_r = erase_prefix (right_list, iter_r);
+ }
+ }
+
+ if (update_l)
+ {
+ l_begin = left_list.begin ();
+ l_end = left_list.end ();
+ }
+ if (update_r)
+ {
+ r_begin = right_list.begin ();
+ r_end = right_list.end ();
+ }
+ }
+}
+
+// Given a list of insns LEFT_ORIG whose accesses are all adjacent to
+// those in RIGHT_ORIG, try to form them into pairs.
+//
+// Return true iff we formed all the RIGHT_ORIG candidates into
+// pairs.
+bool
+ldp_bb_info::try_form_pairs (insn_list_t *left_orig,
+ insn_list_t *right_orig,
+ bool load_p, unsigned access_size,
+ base_info binfo)
+{
+ // Make a copy of the right list which we can modify to
+ // exclude candidates locally for this invocation.
+ insn_list_t right_copy (*right_orig);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "try_form_pairs [L=%d], cand vecs ", load_p);
+ dump_insn_list (dump_file, *left_orig);
+ fprintf (dump_file, " x ");
+ dump_insn_list (dump_file, right_copy);
+ fprintf (dump_file, "\n");
+ }
+
+ // List of candidate insns to delete from the original right_list
+ // (because they were formed into a pair).
+ hash_set <insn_info *> to_delete;
+
+ // Now we have a 2D matrix of candidates; traverse it to try to
+ // find pairs of insns that are already adjacent (within the
+ // merged list of accesses).
+ merge_pairs (left_orig->begin (), left_orig->end (),
+ right_copy.begin (), right_copy.end (),
+ *left_orig, right_copy,
+ to_delete, load_p, access_size, binfo,
+ m_emitted_tombstone);
+
+ // If we formed all right candidates into pairs,
+ // then we can skip the next iteration.
+ if (to_delete.elements () == right_orig->size ())
+ return true;
+
+ // Delete items from to_delete.
+ auto right_iter = right_orig->begin ();
+ auto right_end = right_orig->end ();
+ while (right_iter != right_end)
+ {
+ auto right_next = std::next (right_iter);
+
+ if (to_delete.contains (*right_iter))
+ {
+ right_orig->erase (right_iter);
+ right_end = right_orig->end ();
+ }
+
+ right_iter = right_next;
+ }
+
+ return false;
+}
+
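+// Iterate over the accesses in GROUP, looking for adjacent accesses
+// (those whose offsets differ by exactly the access size), and try to
+// form pairs from their candidate insns. BINFO describes the common
+// base, and ENCODED_LFS encodes the load/store-ness and access size.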
+void
+ldp_bb_info::transform_for_base (base_info binfo,
+ int encoded_lfs,
+ access_group &group)
+{
+ const auto lfs = decode_lfs (encoded_lfs);
+ const unsigned access_size = lfs.size;
+
+ bool skip_next = true;
+ access_record *prev_access = nullptr;
+
+ for (auto &access : group.list)
+ {
+ if (skip_next)
+ skip_next = false;
+ else if (known_eq (access.offset, prev_access->offset + access_size))
+ skip_next = try_form_pairs (&prev_access->cand_insns,
+ &access.cand_insns,
+ lfs.load_p, access_size, binfo);
+
+ prev_access = &access;
+ }
+}
+
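+// Remove any tombstone insns emitted for this bb, re-parenting any
+// remaining uses of their memory defs onto the previous def before
+// deleting them.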
+void
+ldp_bb_info::cleanup_tombstones ()
+{
+ // No need to do anything if we didn't emit a tombstone insn for this bb.
+ if (!m_emitted_tombstone)
+ return;
+
+ insn_info *insn = m_bb->head_insn ();
+ while (insn)
+ {
+ insn_info *next = insn->next_nondebug_insn ();
+ if (!insn->is_real () || !tombstone_insn_p (insn))
+ {
+ insn = next;
+ continue;
+ }
+
+ auto def = memory_access (insn->defs ());
+ auto set = dyn_cast <set_info *> (def);
+ if (set && set->has_any_uses ())
+ {
+ def_info *prev_def = def->prev_def ();
+ auto prev_set = dyn_cast <set_info *> (prev_def);
+ if (!prev_set)
+ gcc_unreachable (); // TODO: handle this if needed.
+
+ while (set->first_use ())
+ crtl->ssa->reparent_use (set->first_use (), prev_set);
+ }
+
+ // Now that the set has no uses, we can delete the insn.
+ insn_change change (insn, insn_change::DELETE);
+ crtl->ssa->change_insn (change);
+ insn = next;
+ }
+}
+
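+// Given a map from (base, encoded LFS) keys to access groups, call
+// transform_for_base on each entry.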
+template<typename Map>
+void
+ldp_bb_info::traverse_base_map (Map &map)
+{
+ for (auto kv : map)
+ {
+ const auto &key = kv.first;
+ auto &value = kv.second;
+ const base_info binfo (key.first);
+ transform_for_base (binfo, key.second, value);
+ }
+}
+
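+// Try to form pairs from all of the accesses we tracked in this bb.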
+void
+ldp_bb_info::transform ()
+{
+ traverse_base_map (expr_map);
+ traverse_base_map (def_map);
+}
+
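+// Set up the infrastructure the pass needs: dominators, dataflow, and
+// the RTL-SSA form it operates on.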
+static void
+ldp_fusion_init ()
+{
+ calculate_dominance_info (CDI_DOMINATORS);
+ df_analyze ();
+ crtl->ssa = new rtl_ssa::function_info (cfun);
+}
+
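+// Tear down the pass: apply any pending RTL-SSA updates (cleaning up
+// the CFG if required), then free the dominance info and the RTL-SSA
+// form.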
+static void
+ldp_fusion_destroy ()
+{
+ if (crtl->ssa->perform_pending_updates ())
+ cleanup_cfg (0);
+
+ free_dominance_info (CDI_DOMINATORS);
+
+ delete crtl->ssa;
+ crtl->ssa = nullptr;
+}
+
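+// Main function for the pass, run on a single basic block BB: scan the
+// bb for simple SET loads and stores (subject to the tuning policy),
+// track them in an ldp_bb_info, then try to transform the candidates
+// into pairs and clean up any tombstone insns we emitted along the way.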
+void
+ldp_fusion_bb (bb_info *bb)
+{
+ const bool track_loads
+ = aarch64_tune_params.ldp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
+ const bool track_stores
+ = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
+
+ ldp_bb_info bb_state (bb);
+
+ for (auto insn : bb->nondebug_insns ())
+ {
+ rtx_insn *rti = insn->rtl ();
+
+ if (!rti || !INSN_P (rti))
+ continue;
+
+ rtx pat = PATTERN (rti);
+
+ if (GET_CODE (pat) != SET)
+ continue;
+
+ if (track_stores && MEM_P (XEXP (pat, 0)))
+ bb_state.track_access (insn, false, XEXP (pat, 0));
+ else if (track_loads && MEM_P (XEXP (pat, 1)))
+ bb_state.track_access (insn, true, XEXP (pat, 1));
+ }
+
+ bb_state.transform ();
+ bb_state.cleanup_tombstones ();
+}
+
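+// Run the pass over each basic block in the function.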
+void
+ldp_fusion ()
+{
+ ldp_fusion_init ();
+
+ for (auto bb : crtl->ssa->bbs ())
+ ldp_fusion_bb (bb);
+
+ ldp_fusion_destroy ();
+}
+
+namespace {
+
+const pass_data pass_data_ldp_fusion =
+{
+ RTL_PASS, /* type */
+ "ldp_fusion", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_ldp_fusion : public rtl_opt_pass
+{
+public:
+ pass_ldp_fusion (gcc::context *ctx)
+ : rtl_opt_pass (pass_data_ldp_fusion, ctx)
+ {}
+
+ opt_pass *clone () override { return new pass_ldp_fusion (m_ctxt); }
+
+ bool gate (function *) final override
+ {
+ if (!optimize || optimize_debug)
+ return false;
+
+ // If the tuning policy says never to form ldps or stps, don't run
+ // the pass.
+ if ((aarch64_tune_params.ldp_policy_model
+ == AARCH64_LDP_STP_POLICY_NEVER)
+ && (aarch64_tune_params.stp_policy_model
+ == AARCH64_LDP_STP_POLICY_NEVER))
+ return false;
+
+ if (reload_completed)
+ return flag_aarch64_late_ldp_fusion;
+ else
+ return flag_aarch64_early_ldp_fusion;
+ }
+
+ unsigned execute (function *) final override
+ {
+ ldp_fusion ();
+ return 0;
+ }
+};
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_ldp_fusion (gcc::context *ctx)
+{
+ return new pass_ldp_fusion (ctx);
+}
+
+#include "gt-aarch64-ldp-fusion.h"
diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..f38c642414e 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,5 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion);
+INSERT_PASS_BEFORE (pass_peephole2, 1, pass_ldp_fusion);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 60a55f4bc19..1ccc516622b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1049,6 +1049,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context *);
rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
+rtl_opt_pass *make_pass_ldp_fusion (gcc::context *);
poly_uint64 aarch64_regmode_natural_size (machine_mode);
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a518202a1..5d63b2ec8ac 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -271,6 +271,16 @@ mtrack-speculation
Target Var(aarch64_track_speculation)
Generate code to track when the CPU might be speculating incorrectly.
+mearly-ldp-fusion
+Target Var(flag_aarch64_early_ldp_fusion) Optimization Init(1)
+Enable the pre-RA AArch64-specific pass to fuse loads and stores into
+ldp and stp instructions.
+
+mlate-ldp-fusion
+Target Var(flag_aarch64_late_ldp_fusion) Optimization Init(1)
+Enable the post-RA AArch64-specific pass to fuse loads and stores into
+ldp and stp instructions.
+
mstack-protector-guard=
Target RejectNegative Joined Enum(stack_protector_guard) Var(aarch64_stack_protector_guard) Init(SSP_GLOBAL)
Use given stack-protector guard.
@@ -360,3 +370,13 @@ Enum(aarch64_ldp_stp_policy) String(never) Value(AARCH64_LDP_STP_POLICY_NEVER)
EnumValue
Enum(aarch64_ldp_stp_policy) String(aligned) Value(AARCH64_LDP_STP_POLICY_ALIGNED)
+
+-param=aarch64-ldp-alias-check-limit=
+Target Joined UInteger Var(aarch64_ldp_alias_check_limit) Init(8) IntegerRange(0, 65536) Param
+Limit on number of alias checks performed when attempting to form an ldp/stp.
+
+-param=aarch64-ldp-canonicalize-modes=
+Target Joined UInteger Var(aarch64_ldp_canonicalize_modes) Init(0) IntegerRange(0,1) Param
+Debugging param to change the strategy for adjusting modes when forming load
+and store pairs. When set to 0, we only ensure modes agree on VECTOR_MODE_P.
+When set to 1, we canonicalize to a single mode for a given size.
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index a9a244ab6d6..37917344a54 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -176,6 +176,13 @@ aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/aarch64/aarch64-cc-fusion.cc
+aarch64-ldp-fusion.o: $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc \
+ $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
+ $(RTL_SSA_H) cfgcleanup.h tree-pass.h ordered-hash-map.h tree-dfa.h \
+ fold-const.h tree-hash-traits.h print-tree.h
+ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+ $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc
+
comma=,
MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG))))
MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))