[v2,11/11] aarch64: Add new load/store pair fusion pass

Message ID ZTFDa/6aOhyZIAMz@arm.com
State Unresolved

Checks

snail/gcc-patch-check: warning (Git am fail log)

Commit Message

Alex Coplan Oct. 19, 2023, 2:55 p.m. UTC
  Hi,

This v2 fixes a significant compile-time hog in the original patch
for the pass posted here:
https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633355.html

Compiling 502.gcc_r from SPEC CPU 2017 with the early ldp pass enabled
showed a large compile-time regression.  Looking into it, I found that
the benchmark's insn-attrtab.c dominated the compile time, and moreover
that compile time for that file increased by 3.79x when enabling the
early ldp fusion pass at -O.

Running cc1 under a profiler revealed that ~44% of the entire profile
was in rtx_equal_p (inlined via cleanup_tombstones into ldp_fusion_bb).
I missed that we can skip running cleanup_tombstones entirely in the
(common) case that we never emitted a tombstone insn for a given bb.

This patch implements that optimization.  This reduces the overhead for
the early ldp pass on that file to around 1%, which seems more like an
acceptable overhead for an additional pass.
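
Concretely, the per-bb pass state now remembers whether any tombstone
insn was emitted (the m_emitted_tombstone flag in the attached patch),
and the expensive rtx_equal_p scan only runs when one was.  In outline
(a sketch of the idea only, not the incremental diff; see the attached
patch for the real change):

  void ldp_bb_info::cleanup_tombstones ()
  {
    // Nothing to clean up if we never emitted a tombstone insn for
    // this bb.
    if (!m_emitted_tombstone)
      return;

    /* ... existing scan over the bb's insns ... */
  }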

Incrementally, the change is as follows:


A v2 patch for the pass is attached. Bootstrapped/regtested as a series
on aarch64-linux-gnu, OK for trunk?

Thanks,
Alex
diff --git a/gcc/config.gcc b/gcc/config.gcc
index fa192637a52..9c1a604bd5e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -349,8 +349,8 @@ aarch64*-*-*)
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
-	extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
-	target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
+	extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o aarch64-ldp-fusion.o"
+	target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc \$(srcdir)/config/aarch64/aarch64-ldp-fusion.cc"
 	target_has_targetm_common=yes
 	;;
 alpha*-*-*)
diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
new file mode 100644
index 00000000000..f1621c9a384
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -0,0 +1,2395 @@
+// LoadPair fusion optimization pass for AArch64.
+// Copyright (C) 2023 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// GCC is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "cfgcleanup.h"
+#include "tree-pass.h"
+#include "ordered-hash-map.h"
+#include "tree-dfa.h"
+#include "fold-const.h"
+#include "tree-hash-traits.h"
+#include "print-tree.h"
+
+using namespace rtl_ssa;
+
+enum
+{
+  LDP_IMM_BITS = 7,
+  LDP_IMM_MASK = (1 << LDP_IMM_BITS) - 1,
+  LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1)),
+  LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1,
+  LDP_MIN_IMM = -LDP_MAX_IMM - 1,
+};
+
+// We pack these fields (load_p, fpsimd_p, and size) into an integer
+// (LFS) which we use as part of the key into the main hash tables.
+//
+// The idea is that we group candidates together only if they agree on
+// the fields below.  Candidates that disagree on any of these
+// properties shouldn't be merged together.
+struct lfs_fields
+{
+  bool load_p;
+  bool fpsimd_p;
+  unsigned size;
+};
+
+using insn_list_t = std::list <insn_info *>;
+using insn_iter_t = insn_list_t::iterator;
+
+// A tagged union representing either an RTL-SSA def_info base or a
+// tree decl base.
+class base_info
+{
+  pointer_mux <def_info, union tree_node> mux;
+
+public:
+  base_info (tree decl) : mux (decl) {}
+  base_info (def_info *def) : mux (def) {}
+
+  bool is_reg () const { return mux.is_first (); }
+  def_info *get_reg () const
+  {
+    gcc_checking_assert (mux.is_first ());
+    return mux.known_first ();
+  }
+};
+
+// Information about the accesses at a given offset from a particular
+// base.  Stored in an access_group, see below.
+struct access_record
+{
+  poly_int64 offset;
+  std::list<insn_info *> cand_insns;
+  std::list<access_record>::iterator place;
+
+  access_record (poly_int64 off) : offset (off) {}
+};
+
+// A group of accesses where adjacent accesses could be ldp/stp
+// candidates.  The splay tree supports efficient insertion,
+// while the list supports efficient iteration.
+struct access_group
+{
+  splay_tree <access_record *> tree;
+  std::list<access_record> list;
+
+  template<typename Alloc>
+  inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn);
+};
+
+// Information about a potential base candidate, used in try_fuse_pair.
+// There may be zero, one, or two viable RTL bases for a given pair.
+struct base_cand
+{
+  def_info *m_def;
+
+  // FROM_INSN is -1 if the base candidate is already shared by both
+  // candidate insns.  Otherwise it holds the index of the insn from
+  // which the base originated.
+  int from_insn;
+
+  // Initially: dataflow hazards that arise if we choose this base as
+  // the common base register for the pair.
+  //
+  // Later these get narrowed, taking alias hazards into account.
+  insn_info *hazards[2];
+
+  base_cand (def_info *def, int insn)
+    : m_def (def), from_insn (insn), hazards {nullptr, nullptr} {}
+
+  base_cand (def_info *def) : base_cand (def, -1) {}
+
+  bool viable () const
+  {
+    return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]);
+  }
+};
+
+// State used by the pass for a given basic block.
+struct ldp_bb_info
+{
+  using def_hash = nofree_ptr_hash <def_info>;
+  using expr_key_t = pair_hash <tree_operand_hash, int_hash <int, -1, -2>>;
+  using def_key_t = pair_hash <def_hash, int_hash <int, -1, -2>>;
+
+  // Map of <tree base, LFS> -> access_group.
+  ordered_hash_map <expr_key_t, access_group> expr_map;
+
+  // Map of <RTL-SSA def_info *, LFS> -> access_group.
+  ordered_hash_map <def_key_t, access_group> def_map;
+
+  static const size_t obstack_alignment = sizeof (void *);
+  bb_info *m_bb;
+
+  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
+  {
+    obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
+				obstack_alignment, obstack_chunk_alloc,
+				obstack_chunk_free);
+  }
+  ~ldp_bb_info ()
+  {
+    obstack_free (&m_obstack, nullptr);
+  }
+
+  inline void track_access (insn_info *, bool load, rtx mem);
+  inline void transform ();
+  inline void cleanup_tombstones ();
+
+private:
+  // Did we emit a tombstone insn for this bb?
+  bool m_emitted_tombstone;
+  obstack m_obstack;
+
+  inline splay_tree_node<access_record *> *node_alloc (access_record *);
+
+  template<typename Map>
+  inline void traverse_base_map (Map &map);
+  inline void transform_for_base (base_info binfo, int load_size,
+				  access_group &group);
+
+  inline bool try_form_pairs (insn_list_t *, insn_list_t *,
+			      bool load_p, unsigned access_size,
+			      base_info binfo);
+
+  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
+};
+
+splay_tree_node<access_record *> *
+ldp_bb_info::node_alloc (access_record *access)
+{
+  using T = splay_tree_node<access_record *>;
+  void *addr = obstack_alloc (&m_obstack, sizeof (T));
+  return new (addr) T (access);
+}
+
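+// Return true if MEM's address is a legitimate memory address for its
+// mode and address space.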
+static bool
+mem_valid_p (rtx mem)
+{
+  addr_space_t as = MEM_ADDR_SPACE (mem);
+  machine_mode mode = GET_MODE (mem);
+  rtx addr = XEXP (mem, 0);
+  return memory_address_addr_space_p (mode, addr, as);
+}
+
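+// Adjust MEM to have mode MODE and be offset by OFFSET bytes from its
+// current address.  Return the adjusted mem if its new address is still
+// valid, or NULL_RTX otherwise.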
+static rtx
+try_adjust_address (rtx mem, machine_mode mode, poly_int64 offset)
+{
+  gcc_checking_assert (mem_valid_p (mem));
+  rtx adjusted = adjust_address_nv (mem, mode, offset);
+  return mem_valid_p (adjusted) ? adjusted : NULL_RTX;
+}
+
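+// Return true if MODE is an acceptable mode for a single ldp/stp operand:
+// a 4, 8, or 16-byte scalar or fixed-length vector mode, where 16-byte
+// (Q-register) modes are subject to tuning and TImode is only handled
+// after RA.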
+static bool
+ldp_operand_mode_p (machine_mode mode)
+{
+  const bool allow_qregs
+    = !(aarch64_tune_params.extra_tuning_flags
+	& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
+
+  if (VECTOR_MODE_P (mode))
+    {
+      const auto poly_size = GET_MODE_SIZE (mode);
+      if (!poly_size.is_constant ())
+	return false;
+
+      HOST_WIDE_INT size = poly_size.to_constant ();
+      return size == 4
+	|| size == 8
+	|| (size == 16 && allow_qregs);
+    }
+
+  switch (mode)
+    {
+    case E_SImode:
+    case E_DImode:
+    case E_SFmode:
+    case E_SDmode:
+    case E_DFmode:
+    case E_DDmode:
+      return true;
+    case E_TFmode:
+    case E_TDmode:
+      return allow_qregs;
+    case E_TImode:
+      return allow_qregs && reload_completed;
+    default:
+      return false;
+    }
+}
+
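+// Pack the LFS fields into a small integer for use in the hash table
+// keys.  For example, a pre-RA Q-register load (load_p = 1, fpsimd_p = 1,
+// size = 16) encodes as (1 << 3) | (1 << 2) | (log2 (16) - 2) == 14.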
+static int
+encode_lfs (lfs_fields fields)
+{
+  int size_log2 = exact_log2 (fields.size);
+  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
+  return ((int)fields.load_p << 3)
+    | ((int)fields.fpsimd_p << 2)
+    | (size_log2 - 2);
+}
+
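+// Inverse of encode_lfs: unpack LFS back into its three fields.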
+static lfs_fields
+decode_lfs (int lfs)
+{
+  bool load_p = (lfs & (1 << 3));
+  bool fpsimd_p = (lfs & (1 << 2));
+  unsigned size = 1U << ((lfs & 3) + 2);
+  return { load_p, fpsimd_p, size };
+}
+
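+// Record that INSN accesses this group's base at OFFSET, creating a new
+// access_record (in offset order) if this is the first access we've seen
+// at that offset.  ALLOC_NODE is used to allocate splay tree nodes.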
+template<typename Alloc>
+void
+access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
+{
+  auto insert_before = [&](std::list<access_record>::iterator after)
+    {
+      auto it = list.emplace (after, offset);
+      it->cand_insns.push_back (insn);
+      it->place = it;
+      return &*it;
+    };
+
+  if (!list.size ())
+    {
+      auto access = insert_before (list.end ());
+      tree.insert_max_node (alloc_node (access));
+      return;
+    }
+
+  auto compare = [&](splay_tree_node<access_record *> *node)
+    {
+      return compare_sizes_for_sort (offset, node->value ()->offset);
+    };
+  auto result = tree.lookup (compare);
+  splay_tree_node<access_record *> *node = tree.root ();
+  if (result == 0)
+    node->value ()->cand_insns.push_back (insn);
+  else
+    {
+      auto it = node->value ()->place;
+      auto after = (result > 0) ? std::next (it) : it;
+      auto access = insert_before (after);
+      tree.insert_child (node, result > 0, alloc_node (access));
+    }
+}
+
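+// Try to track the access in INSN to MEM via its MEM_EXPR, i.e. using a
+// tree decl as the base.  LFS describes the access.  Return true if the
+// access was tracked this way, false if the caller should fall back to
+// tracking via the RTL base register.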
+bool
+ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
+{
+  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
+    return false;
+
+  // Punt on auto-inc accesses for now so we don't have to deal
+  // with the complexity later on.  TODO: would be good to relax
+  // this eventually.
+  if (side_effects_p (XEXP (mem, 0)))
+    return false;
+
+  poly_int64 offset;
+  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
+						  &offset);
+  if (!base_expr || !DECL_P (base_expr))
+    return false;
+
+  offset += MEM_OFFSET (mem);
+
+  const machine_mode mem_mode = GET_MODE (mem);
+  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+
+  // Punt on misaligned offsets.
+  if (offset.coeffs[0] & (mem_size - 1))
+    return false;
+
+  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
+  access_group &group = expr_map.get_or_insert (key, NULL);
+  auto alloc = [&](access_record *access) { return node_alloc (access); };
+  group.track (alloc, offset, insn);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "[bb %u] tracking insn %d via ",
+	       m_bb->index (), insn->uid ());
+      print_node_brief (dump_file, "mem expr", base_expr, 0);
+      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
+	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
+      print_dec (offset, dump_file);
+      fprintf (dump_file, "]\n");
+    }
+
+  return true;
+}
+
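+// Main per-insn entry point: INSN is a load (LOAD_P) or store with memory
+// operand MEM.  If the access is a suitable ldp/stp candidate, record it,
+// keyed on its MEM_EXPR base where possible and otherwise on the RTL-SSA
+// def of its base register.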
+void
+ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
+{
+  // We can't combine volatile MEMs, so punt on these.
+  if (MEM_VOLATILE_P (mem))
+    return;
+
+  const machine_mode mem_mode = GET_MODE (mem);
+  if (!ldp_operand_mode_p (mem_mode))
+    return;
+
+  // Note ldp_operand_mode_p already rejected VL modes.
+  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+  const bool fpsimd_mode_p = GET_MODE_CLASS (mem_mode) != MODE_INT;
+
+  // For stores, validate the operand being stored.
+  if (!load_p)
+    {
+      rtx st_op = XEXP (PATTERN (insn->rtl ()), 1);
+      if (!register_operand (st_op, mem_mode)
+	  && (fpsimd_mode_p || !aarch64_reg_or_zero (st_op, mem_mode)))
+	return;
+    }
+
+  // N.B. we only want to segregate FP/SIMD accesses from integer accesses
+  // before RA.
+  const bool fpsimd_bit_p = !reload_completed && fpsimd_mode_p;
+  const lfs_fields lfs = { load_p, fpsimd_bit_p, mem_size };
+
+  if (track_via_mem_expr (insn, mem, lfs))
+    return;
+
+  rtx addr = XEXP (mem, 0);
+
+  // TODO: handle auto-inc addressing modes.  We probably want to
+  // record these separately (from the main hash table) and find pair
+  // opportunities by looking for other load/store instructions that are
+  // related by dataflow on the base register.
+  poly_int64 poly_off;
+  rtx base = strip_offset (addr, &poly_off);
+  if (!REG_P (base))
+    return;
+
+  // Punt on accesses relative to the eliminable regs: since we don't
+  // know the elimination offset pre-RA, we should postpone forming
+  // pairs on such accesses until after RA.
+  if (!reload_completed
+      && (REGNO (base) == FRAME_POINTER_REGNUM
+	  || REGNO (base) == ARG_POINTER_REGNUM))
+    return;
+
+  // No ldp instructions with variable-length addressing.
+  if (!poly_off.is_constant ())
+    return;
+
+  HOST_WIDE_INT offset = poly_off.to_constant ();
+
+  // ldp/stp only handle offsets that are a multiple of the access size.
+  if (offset % mem_size != 0)
+    return;
+
+  // Normalize the offset.
+  offset /= mem_size;
+
+  // Check if the offset is in range.
+  if (offset > LDP_MAX_IMM || offset < LDP_MIN_IMM)
+    return;
+
+  // Now need to find def of base register.
+  def_info *base_def;
+  use_info *base_use = nullptr;
+  for (auto use_info : insn->uses ())
+    if (use_info->is_reg () && use_info->regno () == REGNO (base))
+      {
+	base_use = use_info;
+	break;
+      }
+
+  gcc_assert (base_use);
+  base_def = base_use->def ();
+  if (!base_def)
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "base register (regno %d) of insn %d is undefined\n",
+		 REGNO (base), insn->uid ());
+      return;
+    }
+
+  const auto key = std::make_pair (base_def, encode_lfs (lfs));
+  access_group &group = def_map.get_or_insert (key, NULL);
+  auto alloc = [&](access_record *access) { return node_alloc (access); };
+  group.track (alloc, poly_off, insn);
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "[bb %u] tracking insn %d [L=%d, FP=%d, %smode, off=%d]\n",
+	     m_bb->index (), insn->uid (), lfs.load_p, lfs.fpsimd_p,
+	     mode_name[mem_mode], (int)poly_off.to_constant ());
+}
+
+// Dummy predicate that never ignores any insns.
+static bool no_ignore (insn_info *) { return false; }
+
+// Return the latest dataflow hazard before INSN.
+//
+// If IGNORE is non-NULL, this points to a sub-rtx which we should
+// ignore for dataflow purposes.  This is needed when considering
+// changing the RTL base of an access discovered through a MEM_EXPR
+// base.
+//
+// N.B. we ignore any defs/uses of memory here as we deal with that
+// separately, making use of alias disambiguation.
+static insn_info *
+latest_hazard_before (insn_info *insn, rtx *ignore)
+{
+  insn_info *result = nullptr;
+  auto hazard = [insn, &result](insn_info *h)
+    {
+      gcc_checking_assert (*h < *insn);
+      if (!result || *h > *result)
+	result = h;
+    };
+
+  rtx pat = PATTERN (insn->rtl ());
+  auto ignore_use = [&](use_info *u)
+    {
+      if (u->is_mem ())
+	return true;
+
+      return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
+    };
+
+  // Find defs of uses in INSN (RaW).
+  for (auto use : insn->uses ())
+    if (!ignore_use (use) && use->def ())
+      hazard (use->def ()->insn ());
+
+  // Find previous defs (WaW) or previous uses (WaR) of defs in INSN.
+  for (auto def : insn->defs ())
+    {
+      if (def->is_mem ())
+	continue;
+
+      if (def->prev_def ())
+	{
+	  hazard (def->prev_def ()->insn ()); // WaW
+
+	  auto set = dyn_cast <set_info *> (def->prev_def ());
+	  if (set && set->has_nondebug_insn_uses ())
+	    for (auto use : set->reverse_nondebug_insn_uses ())
+	      if (use->insn () != insn)
+		{
+		  hazard (use->insn ()); // WaR
+		  break;
+		}
+	}
+
+      if (!HARD_REGISTER_NUM_P (def->regno ()))
+	continue;
+
+      // Also need to check backwards for call clobbers (WaW).
+      for (auto call_group : def->ebb ()->call_clobbers ())
+	{
+	  if (!call_group->clobbers (def->resource ()))
+	    continue;
+
+	  auto clobber_insn = prev_call_clobbers_ignoring (*call_group,
+							   def->insn (),
+							   no_ignore);
+	  if (clobber_insn)
+	    hazard (clobber_insn);
+	}
+
+    }
+
+  return result;
+}
+
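+// Like latest_hazard_before, but return the first dataflow hazard after
+// INSN.  IGNORE has the same meaning as for latest_hazard_before.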
+static insn_info *
+first_hazard_after (insn_info *insn, rtx *ignore)
+{
+  insn_info *result = nullptr;
+  auto hazard = [insn, &result](insn_info *h)
+    {
+      gcc_checking_assert (*h > *insn);
+      if (!result || *h < *result)
+	result = h;
+    };
+
+  rtx pat = PATTERN (insn->rtl ());
+  auto ignore_use = [&](use_info *u)
+    {
+      if (u->is_mem ())
+	return true;
+
+      return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
+    };
+
+  for (auto def : insn->defs ())
+    {
+      if (def->is_mem ())
+	continue;
+
+      if (def->next_def ())
+	hazard (def->next_def ()->insn ()); // WaW
+
+      auto set = dyn_cast <set_info *> (def);
+      if (set && set->has_nondebug_insn_uses ())
+	hazard (set->first_nondebug_insn_use ()->insn ()); // RaW
+
+      if (!HARD_REGISTER_NUM_P (def->regno ()))
+	continue;
+
+      // Also check for call clobbers of this def (WaW).
+      for (auto call_group : def->ebb ()->call_clobbers ())
+	{
+	  if (!call_group->clobbers (def->resource ()))
+	    continue;
+
+	  auto clobber_insn = next_call_clobbers_ignoring (*call_group,
+							   def->insn (),
+							   no_ignore);
+	  if (clobber_insn)
+	    hazard (clobber_insn);
+	}
+    }
+
+  // Find any subsequent defs of uses in INSN (WaR).
+  for (auto use : insn->uses ())
+    {
+      if (ignore_use (use))
+	continue;
+
+      if (use->def ())
+	{
+	  auto def = use->def ()->next_def ();
+	  if (def && def->insn () == insn)
+	    def = def->next_def ();
+
+	  if (def)
+	    hazard (def->insn ());
+	}
+
+      if (!HARD_REGISTER_NUM_P (use->regno ()))
+	continue;
+
+      // Also need to handle call clobbers of our uses (again WaR).
+      //
+      // See restrict_movement_for_uses_ignoring for why we don't
+      // need to check backwards for call clobbers.
+      for (auto call_group : use->ebb ()->call_clobbers ())
+	{
+	  if (!call_group->clobbers (use->resource ()))
+	    continue;
+
+	  auto clobber_insn = next_call_clobbers_ignoring (*call_group,
+							   use->insn (),
+							   no_ignore);
+	  if (clobber_insn)
+	    hazard (clobber_insn);
+	}
+    }
+
+  return result;
+}
+
+
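+// What to do with each of the two candidate stores when forming an stp:
+// re-purpose the insn to hold the pair (CHANGE), delete it outright
+// (DELETE), or replace its pattern with a tombstone (TOMBSTONE) because
+// its def of memory still has uses.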
+enum change_strategy {
+  CHANGE,
+  DELETE,
+  TOMBSTONE,
+};
+
+// Given a change_strategy S, convert it to a string (for output in the
+// dump file).
+static const char *cs_to_string (change_strategy s)
+{
+#define C(x) case x: return #x
+  switch (s)
+    {
+      C (CHANGE);
+      C (DELETE);
+      C (TOMBSTONE);
+    }
+#undef C
+  gcc_unreachable ();
+}
+
+// TODO: should this live in RTL-SSA?
+static bool
+ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
+{
+  // If either range is empty, then their intersection is empty.
+  if (!r1 || !r2)
+    return false;
+
+  // When do they not overlap? When one range finishes before the other
+  // starts, i.e. (*r1.last < *r2.first || *r2.last < *r1.first).
+  // Inverting this, we get the below.
+  return *r1.last >= *r2.first && *r2.last >= *r1.first;
+}
+
+// Get the range of insns that def feeds.
+static insn_range_info get_def_range (def_info *def)
+{
+  insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
+  return { def->insn (), last };
+}
+
+// Given a def (of memory), return the downwards range within which we
+// can safely move this def.
+static insn_range_info
+def_downwards_move_range (def_info *def)
+{
+  auto range = get_def_range (def);
+
+  auto set = dyn_cast <set_info *> (def);
+  if (!set || !set->has_any_uses ())
+    return range;
+
+  auto use = set->first_nondebug_insn_use ();
+  if (use)
+    range = move_earlier_than (range, use->insn ());
+
+  return range;
+}
+
+// Given a def (of memory), return the upwards range within which we can
+// safely move this def.
+static insn_range_info
+def_upwards_move_range (def_info *def)
+{
+  def_info *prev = def->prev_def ();
+  insn_range_info range { prev->insn (), def->insn () };
+
+  auto set = dyn_cast <set_info *> (prev);
+  if (!set || !set->has_any_uses ())
+    return range;
+
+  auto use = set->last_nondebug_insn_use ();
+  if (use)
+    range = move_later_than (range, use->insn ());
+
+  return range;
+}
+
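+// Given the two candidate store insns FIRST and SECOND (in program order)
+// and the computed MOVE_RANGE for the pair, decide how to handle each
+// store, recording the decisions in STRATEGY.  Return the def of memory
+// whose insn will be re-purposed to hold the stp, or null if we need to
+// create a new insn for the pair instead.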
+static def_info *
+decide_stp_strategy (change_strategy strategy[2],
+		     insn_info *first,
+		     insn_info *second,
+		     const insn_range_info &move_range)
+{
+  strategy[0] = CHANGE;
+  strategy[1] = DELETE;
+
+  unsigned viable = 0;
+  viable |= move_range.includes (first);
+  viable |= ((unsigned) move_range.includes (second)) << 1;
+
+  def_info * const defs[2] = {
+    memory_access (first->defs ()),
+    memory_access (second->defs ())
+  };
+  if (defs[0] == defs[1])
+    viable = 3; // No intervening store, either is viable.
+
+  if (!(viable & 1)
+      && ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
+    viable |= 1;
+  if (!(viable & 2)
+      && ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
+    viable |= 2;
+
+  if (viable == 2)
+    std::swap (strategy[0], strategy[1]);
+  else if (!viable)
+    // Tricky case: need to delete both accesses.
+    strategy[0] = DELETE;
+
+  for (int i = 0; i < 2; i++)
+    {
+      if (strategy[i] != DELETE)
+	continue;
+
+      // See if we can get away without a tombstone.
+      auto set = dyn_cast <set_info *> (defs[i]);
+      if (!set || !set->has_any_uses ())
+	continue; // We can indeed.
+
+      // If both sides are viable for re-purposing, and the other store's
+      // def doesn't have any uses, then we can delete the other store
+      // and re-purpose this store instead.
+      if (viable == 3)
+	{
+	  gcc_assert (strategy[!i] == CHANGE);
+	  auto other_set = dyn_cast <set_info *> (defs[!i]);
+	  if (!other_set || !other_set->has_any_uses ())
+	    {
+	      strategy[i] = CHANGE;
+	      strategy[!i] = DELETE;
+	      break;
+	    }
+	}
+
+      // Alas, we need a tombstone after all.
+      strategy[i] = TOMBSTONE;
+    }
+
+  for (int i = 0; i < 2; i++)
+    if (strategy[i] == CHANGE)
+      return defs[i];
+
+  return nullptr;
+}
+
+static GTY(()) rtx tombstone = NULL_RTX;
+
+// Generate the RTL pattern for a "tombstone"; used temporarily
+// during this pass to replace stores that are marked for deletion
+// where we can't immediately delete the store (e.g. if there are uses
+// hanging off its def of memory).
+//
+// These are deleted at the end of the pass and uses re-parented
+// appropriately at this point.
+static rtx
+gen_tombstone (void)
+{
+  if (!tombstone)
+    {
+      tombstone = gen_rtx_CLOBBER (VOIDmode,
+				   gen_rtx_MEM (BLKmode,
+						gen_rtx_SCRATCH (Pmode)));
+      return tombstone;
+    }
+
+  return copy_rtx (tombstone);
+}
+
+static bool
+tombstone_insn_p (insn_info *insn)
+{
+  rtx x = tombstone ? tombstone : gen_tombstone ();
+  return rtx_equal_p (PATTERN (insn->rtl ()), x);
+}
+
+// Given a mode MODE, return a mode of the same size which we will
+// attempt to use as a canonical load/store mode.
+static machine_mode
+canonical_mode_for_mode (machine_mode mode)
+{
+  switch (GET_MODE_SIZE (mode).to_constant ())
+    {
+    case 4:
+      return E_SImode;
+    case 8:
+      return E_DImode;
+    case 16:
+      return E_V16QImode;
+    }
+
+  gcc_unreachable ();
+}
+
+// Try to convert accesses to happen in a canonical mode where possible.
+static void
+canonicalize_access_modes (rtx pats[2], bool load_p)
+{
+  rtx regs[2];
+  rtx mems[2];
+  machine_mode modes[2];
+  machine_mode canon_mode;
+
+  auto update_pat = [&](int i, rtx mem, rtx reg)
+    {
+      if (load_p)
+	pats[i] = gen_rtx_SET (reg, mem);
+      else
+	pats[i] = gen_rtx_SET (mem, reg);
+    };
+
+  for (int i = 0; i < 2; i++)
+    {
+      mems[i] = XEXP (pats[i], load_p);
+      regs[i] = XEXP (pats[i], !load_p);
+      modes[i] = GET_MODE (mems[i]);
+    }
+
+  canon_mode = canonical_mode_for_mode (modes[0]);
+  gcc_checking_assert (canon_mode == canonical_mode_for_mode (modes[1]));
+
+  if (modes[0] == canon_mode && modes[1] == canon_mode)
+    return;
+
+  // mode; try this up front as it may not be possible.
+  // mode, try this up front as it may not be possible.
+  rtx punned_regs[2] = {};
+  rtx adjusted_mems[2] = {};
+  for (int i = 0; i < 2; i++)
+    {
+      punned_regs[i] = lowpart_subreg (canon_mode, regs[i], modes[i]);
+      if (!punned_regs[i])
+	{
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "  failed to canonicalize reg op mode: %s -> %s\n",
+		     mode_name[modes[i]], mode_name[canon_mode]);
+	  continue;
+	}
+
+      adjusted_mems[i] = try_adjust_address (mems[i], canon_mode, 0);
+      if (!adjusted_mems[i])
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "  failed to canonicalize mem mode: %s -> %s\n",
+		     mode_name[modes[i]], mode_name[canon_mode]);
+	}
+    }
+
+  if (adjusted_mems[0] && adjusted_mems[1])
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "  successfully canonicalized from (%s,%s) -> %smode\n",
+		 mode_name[modes[0]],
+		 mode_name[modes[1]],
+		 mode_name[canon_mode]);
+
+      for (int i = 0; i < 2; i++)
+	update_pat (i, adjusted_mems[i], punned_regs[i]);
+      return;
+    }
+
+  // If we failed to switch to a canonical mode, at least
+  // try and make sure the modes are the same.
+  if (modes[0] == modes[1])
+    return;
+
+  for (int i = 0; i < 2; i++)
+    {
+      rtx punned_reg = lowpart_subreg (modes[!i], regs[i], modes[i]);
+      if (!punned_reg)
+	continue;
+
+      rtx adjusted_mem = try_adjust_address (mems[i], modes[!i], 0);
+      if (!adjusted_mem)
+	continue;
+
+      if (dump_file)
+	fprintf (dump_file,
+		 "  failed to canonicalize, but made modes agree (%s -> %s)\n",
+		 mode_name[modes[i]], mode_name[modes[!i]]);
+      update_pat (i, adjusted_mem, punned_reg);
+      return;
+    }
+
+  // Worst case, recog will bail out if the modes are still
+  // incompatible.
+}
+
+// If we have vector x scalar modes, pun into the scalar mode.
+// Otherwise, leave the modes unchanged.
+static bool
+unify_access_modes (rtx pats[2], bool load_p)
+{
+  machine_mode modes[2];
+  for (int i = 0; i < 2; i++)
+    modes[i] = GET_MODE (XEXP (pats[i], load_p));
+
+  if (VECTOR_MODE_P (modes[0]) == VECTOR_MODE_P (modes[1]))
+    return true;
+
+  const int vector_i = VECTOR_MODE_P (modes[1]);
+  const int scalar_i = !vector_i;
+
+  rtx vector_mem = XEXP (pats[vector_i], load_p);
+  rtx vector_reg = XEXP (pats[vector_i], !load_p);
+
+  if (!try_adjust_address (vector_mem, modes[scalar_i], 0))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "failed to unify %smode -> %smode, fall back "
+		 "to canonicalize modes\n",
+		 mode_name[modes[vector_i]], mode_name[modes[scalar_i]]);
+      canonicalize_access_modes (pats, load_p);
+      return true;
+    }
+
+  rtx adjusted_mem = adjust_address (vector_mem, modes[scalar_i], 0);
+  rtx punned_reg = lowpart_subreg (modes[scalar_i],
+				   vector_reg,
+				   modes[vector_i]);
+  if (!punned_reg)
+    {
+      if (dump_file)
+	fprintf (dump_file, "Failed to unify %smode -> %smode\n",
+		 mode_name[modes[vector_i]], mode_name[modes[scalar_i]]);
+      return false;
+    }
+
+  pats[vector_i] = load_p
+    ? gen_rtx_SET (punned_reg, adjusted_mem)
+    : gen_rtx_SET (adjusted_mem, punned_reg);
+  return true;
+}
+
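+// Copy those REG_NOTEs from the list NOTE which can be preserved on the
+// combined pair insn onto RESULT, dropping the rest.  *EH_REGION is used
+// to assert that we see at most one REG_EH_REGION note across both insns.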
+static rtx
+filter_notes (rtx note, rtx result, bool *eh_region)
+{
+  for (; note; note = XEXP (note, 1))
+    {
+      switch (REG_NOTE_KIND (note))
+	{
+	  case REG_EQUAL:
+	  case REG_EQUIV:
+	  case REG_DEAD:
+	  case REG_UNUSED:
+	  case REG_NOALIAS:
+	    // These can all be dropped.  For REG_EQU{AL,IV} they
+	    // cannot apply to non-single_set insns, and
+	    // REG_{DEAD,UNUSED} are re-computed by RTL-SSA, see
+	    // rtl-ssa/changes.cc:update_notes.
+	    //
+	    // Similarly, REG_NOALIAS cannot apply to a parallel.
+	    break;
+	  case REG_EH_REGION:
+	    gcc_assert (!*eh_region);
+	    *eh_region = true;
+	    result = alloc_reg_note (REG_EH_REGION, XEXP (note, 0), result);
+	    break;
+	  case REG_CFA_OFFSET:
+	  case REG_CFA_RESTORE:
+	    result = alloc_reg_note (REG_NOTE_KIND (note),
+				     copy_rtx (XEXP (note, 0)),
+				     result);
+	    break;
+	  default:
+	    // Unexpected REG_NOTE kind.
+	    gcc_unreachable ();
+	}
+    }
+
+  return result;
+}
+
+// Combine the REG_NOTEs of the two candidate insns I1 and I2, keeping
+// only those notes which can be preserved on the resulting pair insn.
+static rtx
+combine_reg_notes (insn_info *i1, insn_info *i2)
+{
+  bool found_eh_region = false;
+  rtx result = NULL_RTX;
+  result = filter_notes (REG_NOTES (i1->rtl ()), result, &found_eh_region);
+  return filter_notes (REG_NOTES (i2->rtl ()), result, &found_eh_region);
+}
+
+// Try and actually fuse the pair given by insns I1 and I2.
+static bool
+fuse_pair (bool load_p,
+	   insn_info *i1,
+	   insn_info *i2,
+	   base_cand &base,
+	   const insn_range_info &move_range,
+	   bool &emitted_tombstone_p)
+{
+  auto attempt = crtl->ssa->new_change_attempt ();
+
+  auto make_change = [&attempt](insn_info *insn)
+    {
+      return crtl->ssa->change_alloc <insn_change> (attempt, insn);
+    };
+  auto make_delete = [&attempt](insn_info *insn)
+    {
+      return crtl->ssa->change_alloc <insn_change> (attempt,
+						    insn,
+						    insn_change::DELETE);
+    };
+
+  // Are we using a tombstone insn for this pair?
+  bool have_tombstone_p = false;
+
+  insn_info *first = (*i1 < *i2) ? i1 : i2;
+  insn_info *second = (first == i1) ? i2 : i1;
+
+  insn_info *insns[2] = { first, second };
+
+  auto_vec <insn_change *> changes;
+  changes.reserve (3);
+
+  rtx pats[2] = {
+    PATTERN (i1->rtl ()),
+    PATTERN (i2->rtl ())
+  };
+
+  use_array input_uses[2] = { first->uses (), second->uses () };
+
+  if (base.from_insn != -1)
+    {
+      // If we're not already using a shared base, we need
+      // to re-write one of the accesses to use the base from
+      // the other insn.
+      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
+
+      const bool lower_base_p = (insns[base.from_insn] == i1);
+
+      rtx base_pat = PATTERN (insns[base.from_insn]->rtl ());
+      rtx change_pat = PATTERN (insns[!base.from_insn]->rtl ());
+      rtx base_mem = XEXP (base_pat, load_p);
+      rtx change_mem = XEXP (change_pat, load_p);
+
+      machine_mode mem_mode = GET_MODE (base_mem);
+      HOST_WIDE_INT adjust_amt = GET_MODE_SIZE (mem_mode).to_constant ();
+      if (!lower_base_p)
+	adjust_amt *= -1;
+
+      rtx change_reg = XEXP (change_pat, !load_p);
+      machine_mode mode_for_mem = GET_MODE (change_mem);
+      if (!try_adjust_address (base_mem, mode_for_mem, adjust_amt))
+	{
+	  // We need to canonicalize the mode to make the adjustment.
+	  // This should be guaranteed to work as we checked this in
+	  // get_viable_bases.
+	  mode_for_mem = canonical_mode_for_mode (mode_for_mem);
+	  change_reg = lowpart_subreg (mode_for_mem,
+				       change_reg,
+				       GET_MODE (change_mem));
+	  gcc_assert (change_reg);
+	}
+
+      rtx new_mem = adjust_address (base_mem, mode_for_mem, adjust_amt);
+      rtx new_set = load_p
+	? gen_rtx_SET (change_reg, new_mem)
+	: gen_rtx_SET (new_mem, change_reg);
+
+      pats[lower_base_p] = new_set;
+
+      hash_set <use_info *> uses_to_drop;
+      use_array &uses_to_change = input_uses[!base.from_insn];
+
+      for (auto use : uses_to_change)
+	if (use->is_reg ()
+	    && !refers_to_regno_p (use->regno (),
+				   use->regno () + 1,
+				   change_pat,
+				   &XEXP (change_pat, load_p))
+	    && uses_to_drop.add (use))
+	  gcc_unreachable ();
+
+      if (!uses_to_drop.is_empty ())
+	{
+	  access_array_builder builder (attempt);
+	  gcc_checking_assert (uses_to_drop.elements ()
+			       <= uses_to_change.size ());
+	  builder.reserve (uses_to_change.size () - uses_to_drop.elements ());
+	  auto it = uses_to_change.begin ();
+	  auto end = uses_to_change.end ();
+	  for (; it != end; ++it)
+	    if (!uses_to_drop.contains (*it))
+	      builder.quick_push (*it);
+	  uses_to_change = use_array (builder.finish ());
+	}
+    }
+
+  if (aarch64_ldp_canonicalize_modes)
+    canonicalize_access_modes (pats, load_p);
+  else if (!unify_access_modes (pats, load_p))
+    return false;
+
+  // Go through and drop uses that only occur in register notes,
+  // as we won't be preserving those.
+  for (int i = 0; i < 2; i++)
+    {
+      auto rti = insns[i]->rtl ();
+      if (!REG_NOTES (rti))
+	continue;
+
+      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
+    }
+
+  // Now that we know what base mem we're going to use, check if it's OK
+  // with the ldp/stp policy.
+  rtx first_mem = XEXP (pats[0], load_p);
+  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
+						load_p,
+						GET_MODE (first_mem)))
+    {
+      if (dump_file)
+	fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
+		 i1->uid (), i2->uid ());
+      return false;
+    }
+
+  rtx reg_notes = combine_reg_notes (i1, i2);
+
+  rtx pair_pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, pats[0], pats[1]));
+
+  insn_change *pair_change = nullptr;
+  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
+      rtx_insn *rti = change->insn ()->rtl ();
+      gcc_assert (validate_unshare_change (rti, &PATTERN (rti), pair_pat,
+					   true));
+      gcc_assert (validate_change (rti, &REG_NOTES (rti),
+				   reg_notes, true));
+  };
+
+  if (load_p)
+    {
+      changes.quick_push (make_delete (first));
+      pair_change = make_change (second);
+      changes.quick_push (pair_change);
+
+      pair_change->move_range = move_range;
+      pair_change->new_defs = merge_access_arrays (attempt,
+					      first->defs (),
+					      second->defs ());
+      gcc_assert (pair_change->new_defs.is_valid ());
+
+      pair_change->new_uses
+	= merge_access_arrays (attempt,
+			       drop_memory_access (input_uses[0]),
+			       drop_memory_access (input_uses[1]));
+      gcc_assert (pair_change->new_uses.is_valid ());
+      set_pair_pat (pair_change);
+    }
+  else
+    {
+      change_strategy strategy[2];
+      def_info *stp_def = decide_stp_strategy (strategy, first, second,
+					       move_range);
+      if (dump_file)
+	{
+	  auto cs1 = cs_to_string (strategy[0]);
+	  auto cs2 = cs_to_string (strategy[1]);
+	  fprintf (dump_file,
+		   "  stp strategy for candidate insns (%d,%d): (%s,%s)\n",
+		   insns[0]->uid (), insns[1]->uid (), cs1, cs2);
+	  if (stp_def)
+	    fprintf (dump_file,
+		     "  re-using mem def from insn %d\n",
+		     stp_def->insn ()->uid ());
+	}
+
+      insn_change *change;
+      for (int i = 0; i < 2; i++)
+	{
+	  switch (strategy[i])
+	    {
+	    case DELETE:
+	      changes.quick_push (make_delete (insns[i]));
+	      break;
+	    case TOMBSTONE:
+	    case CHANGE:
+	      change = make_change (insns[i]);
+	      if (strategy[i] == CHANGE)
+		{
+		  set_pair_pat (change);
+		  change->new_uses = merge_access_arrays (attempt,
+							  input_uses[0],
+							  input_uses[1]);
+		  auto d1 = drop_memory_access (first->defs ());
+		  auto d2 = drop_memory_access (second->defs ());
+		  change->new_defs = merge_access_arrays (attempt, d1, d2);
+		  gcc_assert (stp_def);
+		  change->new_defs = insert_access (attempt,
+						    stp_def,
+						    change->new_defs);
+		  change->move_range = move_range;
+		  pair_change = change;
+		}
+	      else
+		{
+		  rtx_insn *rti = insns[i]->rtl ();
+		  gcc_assert (validate_change (rti, &PATTERN (rti),
+					       gen_tombstone (), true));
+		  gcc_assert (validate_change (rti, &REG_NOTES (rti),
+					       NULL_RTX, true));
+		  change->new_uses = use_array (nullptr, 0);
+		  have_tombstone_p = true;
+		}
+	      gcc_assert (change->new_defs.is_valid ());
+	      gcc_assert (change->new_uses.is_valid ());
+	      changes.quick_push (change);
+	      break;
+	    }
+	}
+
+      if (!stp_def)
+	{
+	  // Tricky case.  Cannot re-purpose existing insns for stp.
+	  // Need to insert new insn.
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "  stp fusion: cannot re-purpose candidate stores\n");
+
+	  auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
+	  change = make_change (new_insn);
+	  change->move_range = move_range;
+	  change->new_uses = merge_access_arrays (attempt,
+						  input_uses[0],
+						  input_uses[1]);
+	  gcc_assert (change->new_uses.is_valid ());
+
+	  auto d1 = drop_memory_access (first->defs ());
+	  auto d2 = drop_memory_access (second->defs ());
+	  change->new_defs = merge_access_arrays (attempt, d1, d2);
+	  gcc_assert (change->new_defs.is_valid ());
+
+	  auto new_set = crtl->ssa->create_set (attempt, new_insn, memory);
+	  change->new_defs = insert_access (attempt, new_set,
+					    change->new_defs);
+	  gcc_assert (change->new_defs.is_valid ());
+	  changes.safe_insert (1, change);
+	  pair_change = change;
+	}
+    }
+
+  auto n_changes = changes.length ();
+  gcc_checking_assert (n_changes == 2 || n_changes == 3);
+
+  auto is_changing = insn_is_changing (changes);
+  for (unsigned i = 0; i < n_changes; i++)
+    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
+
+  // Check the pair pattern is recog'd.
+  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
+    {
+      if (dump_file)
+	fprintf (dump_file, "  failed to form pair, recog failed\n");
+
+      // Free any reg notes we allocated.
+      while (reg_notes)
+	{
+	  rtx next = XEXP (reg_notes, 1);
+	  free_EXPR_LIST_node (reg_notes);
+	  reg_notes = next;
+	}
+      cancel_changes (0);
+      return false;
+    }
+
+  gcc_assert (crtl->ssa->verify_insn_changes (changes));
+
+  confirm_change_group ();
+  crtl->ssa->change_insns (changes);
+  emitted_tombstone_p |= have_tombstone_p;
+  return true;
+}
+
+// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
+// within our BUDGET for alias analysis.
+static bool
+store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
+{
+  if (tombstone_insn_p (store_insn))
+    return false;
+
+  if (!budget)
+    {
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "exceeded budget, assuming store %d aliases with mem ",
+		   store_insn->uid ());
+	  print_simple_rtl (dump_file, mem);
+	  fprintf (dump_file, "\n");
+	}
+
+      return true;
+    }
+
+  budget--;
+  return memory_modified_in_insn_p (mem, store_insn->rtl ());
+}
+
+// Return true if LOAD may be modified by STORE.  Make sure we keep
+// within our BUDGET for alias analysis.
+static bool
+load_modified_by_store_p (insn_info *load,
+			  insn_info *store,
+			  int &budget)
+{
+  gcc_checking_assert (budget >= 0);
+
+  if (!budget)
+    {
+      if (dump_file)
+	{
+	  fprintf (dump_file,
+		   "exceeded budget, assuming load %d aliases with store %d\n",
+		   load->uid (), store->uid ());
+	}
+      return true;
+    }
+
+  // It isn't safe to re-order stores over calls.
+  if (CALL_P (load->rtl ()))
+    return true;
+
+  budget--;
+  return modified_in_p (PATTERN (load->rtl ()), store->rtl ());
+}
+
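+// Virtual base class for the store/load walkers below.  These walk over
+// the memory accesses between a candidate pair of insns; conflict_p
+// checks the current access for an aliasing conflict with the candidate
+// access, keeping within the alias analysis budget.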
+struct alias_walker
+{
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual bool conflict_p (int &budget) const = 0;
+  virtual void advance () = 0;
+};
+
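+// Walk over stores (defs of memory), forwards or backwards according to
+// REVERSE, starting from MEM_DEF and stopping at LIMIT_INSN.  A conflict
+// is reported if a walked store may modify the candidate mem MEM.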
+template<bool reverse>
+class store_walker : public alias_walker
+{
+  using def_iter_t = typename std::conditional <reverse,
+	reverse_def_iterator, def_iterator>::type;
+
+  def_iter_t def_iter;
+  rtx cand_mem;
+  insn_info *limit;
+
+public:
+  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn) :
+    def_iter (mem_def), cand_mem (mem), limit (limit_insn) {}
+
+  bool valid () const override
+    {
+      if (!*def_iter)
+	return false;
+
+      if (reverse)
+	return *((*def_iter)->insn ()) > *limit;
+      else
+	return *((*def_iter)->insn ()) < *limit;
+    }
+  insn_info *insn () const override { return (*def_iter)->insn (); }
+  void advance () override { def_iter++; }
+  bool conflict_p (int &budget) const override
+  {
+    return store_modifies_mem_p (cand_mem, insn (), budget);
+  }
+};
+
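+// Walk over loads, i.e. the nondebug insn uses hanging off each def of
+// memory, forwards or backwards according to REVERSE, stopping at
+// LIMIT_INSN.  A conflict is reported if a walked load may be modified by
+// the candidate store.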
+template<bool reverse>
+class load_walker : public alias_walker
+{
+  using def_iter_t = typename std::conditional <reverse,
+	reverse_def_iterator, def_iterator>::type;
+  using use_iter_t = typename std::conditional <reverse,
+	reverse_use_iterator, nondebug_insn_use_iterator>::type;
+
+  def_iter_t def_iter;
+  use_iter_t use_iter;
+  insn_info *cand_store;
+  insn_info *limit;
+
+  static use_info *start_use_chain (def_iter_t &def_iter)
+  {
+    set_info *set = nullptr;
+    for (; *def_iter; def_iter++)
+      {
+	set = dyn_cast <set_info *> (*def_iter);
+	if (!set)
+	  continue;
+
+	use_info *use = reverse
+	  ? set->last_nondebug_insn_use ()
+	  : set->first_nondebug_insn_use ();
+
+	if (use)
+	  return use;
+      }
+
+    return nullptr;
+  }
+
+public:
+  void advance () override
+  {
+    use_iter++;
+    if (*use_iter)
+      return;
+    def_iter++;
+    use_iter = start_use_chain (def_iter);
+  }
+
+  insn_info *insn () const override
+  {
+    gcc_checking_assert (*use_iter);
+    return (*use_iter)->insn ();
+  }
+
+  bool valid () const override
+  {
+    if (!*use_iter)
+      return false;
+
+    if (reverse)
+      return *((*use_iter)->insn ()) > *limit;
+    else
+      return *((*use_iter)->insn ()) < *limit;
+  }
+
+  bool conflict_p (int &budget) const override
+  {
+    return load_modified_by_store_p (insn (), cand_store, budget);
+  }
+
+  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
+    : def_iter (def), use_iter (start_use_chain (def_iter)),
+      cand_store (store), limit (limit_insn) {}
+};
+
+// Process our alias_walkers in a round-robin fashion, proceeding until
+// nothing more can be learned from alias analysis.
+//
+// We try to maintain the invariant that if a walker becomes invalid, we
+// set its pointer to null.
+static void
+do_alias_analysis (insn_info *alias_hazards[4],
+		   alias_walker *walkers[4],
+		   bool load_p)
+{
+  const int n_walkers = 2 + (2 * !load_p);
+  int budget = aarch64_ldp_alias_check_limit;
+
+  auto next_walker = [walkers,n_walkers](int current) -> int {
+    for (int j = 1; j <= n_walkers; j++)
+      {
+	int idx = (current + j) % n_walkers;
+	if (walkers[idx])
+	  return idx;
+      }
+    return -1;
+  };
+
+  int i = -1;
+  for (int j = 0; j < n_walkers; j++)
+    {
+      alias_hazards[j] = nullptr;
+      if (!walkers[j])
+	continue;
+
+      if (!walkers[j]->valid ())
+	walkers[j] = nullptr;
+      else if (i == -1)
+	i = j;
+    }
+
+  while (i >= 0)
+    {
+      int insn_i = i % 2;
+      int paired_i = (i & 2) + !insn_i;
+      int pair_fst = (i & 2);
+      int pair_snd = (i & 2) + 1;
+
+      if (walkers[i]->conflict_p (budget))
+	{
+	  alias_hazards[i] = walkers[i]->insn ();
+
+	  // We got an aliasing conflict for this {load,store} walker,
+	  // so we don't need to walk any further.
+	  walkers[i] = nullptr;
+
+	  // If we have a pair of alias conflicts that prevent
+	  // forming the pair, stop.  There's no need to do further
+	  // analysis.
+	  if (alias_hazards[paired_i]
+	      && (*alias_hazards[pair_fst] <= *alias_hazards[pair_snd]))
+	    return;
+
+	  if (!load_p)
+	    {
+	      int other_pair_fst = (pair_fst ? 0 : 2);
+	      int other_paired_i = other_pair_fst + !insn_i;
+
+	      int x_pair_fst = (i == pair_fst) ? i : other_paired_i;
+	      int x_pair_snd = (i == pair_fst) ? other_paired_i : i;
+
+	      // Similarly, handle the case where we have a {load,store}
+	      // or {store,load} alias hazard pair that prevents forming
+	      // the pair.
+	      if (alias_hazards[other_paired_i]
+		  && *alias_hazards[x_pair_fst] <= *alias_hazards[x_pair_snd])
+		return;
+	    }
+	}
+
+      if (walkers[i])
+	{
+	  walkers[i]->advance ();
+
+	  if (!walkers[i]->valid ())
+	    walkers[i] = nullptr;
+	}
+
+      i = next_walker (i);
+    }
+}
+
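+// INSNS gives a candidate pair (in program order) that was discovered via
+// a common MEM_EXPR base.  For each insn, try to find a base register def
+// that could serve as the common RTL base for the pair, pushing any such
+// candidates onto BASE_CANDS.  ACCESS_SIZE is the size of each access and
+// REVERSED says whether the insn with the lower memory offset comes
+// second in program order.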
+static void
+get_viable_bases (insn_info *insns[2],
+		  vec <base_cand> &base_cands,
+		  rtx cand_mems[2],
+		  rtx reg_ops[2],
+		  unsigned access_size,
+		  bool reversed)
+{
+  // We discovered this pair through a common MEM_EXPR base.
+  // Need to ensure that we have a common base register def
+  // that is live at both locations.
+  def_info *base_defs[2] = {};
+  for (int i = 0; i < 2; i++)
+    {
+      const bool is_lower = (i == reversed);
+      poly_int64 poly_off;
+      rtx addr = strip_offset (XEXP (cand_mems[i], 0), &poly_off);
+
+      if (!REG_P (addr) || !poly_off.is_constant ())
+	continue;
+
+      // Punt on accesses relative to eliminable regs.  Since we don't know the
+      // elimination offset pre-RA, we should postpone forming pairs on such
+      // accesses until after RA.
+      if (!reload_completed
+	  && (REGNO (addr) == FRAME_POINTER_REGNUM
+	      || REGNO (addr) == ARG_POINTER_REGNUM))
+	continue;
+
+      HOST_WIDE_INT base_off = poly_off.to_constant ();
+
+      // It should be unlikely that we ever punt here, since MEM_EXPR offset
+      // alignment should be a good proxy for register offset alignment.
+      if (base_off % access_size != 0)
+	{
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "MEM_EXPR offset aligned, reg offset unaligned "
+		     "(insn %d)\n",
+		     insns[i]->uid ());
+	  continue;
+	}
+
+      base_off /= access_size;
+
+      if (!is_lower)
+	base_off--;
+
+      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
+	continue;
+
+      for (auto use : insns[i]->uses ())
+	if (use->is_reg () && use->regno () == REGNO (addr))
+	  {
+	    base_defs[i] = use->def ();
+	    break;
+	  }
+    }
+
+  if (!base_defs[0] && !base_defs[1])
+    {
+      if (dump_file)
+	fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return;
+    }
+
+  if (base_defs[0] == base_defs[1])
+    {
+      // Easy case: insns already share the same base reg def.
+      base_cands.quick_push (base_defs[0]);
+      return;
+    }
+
+  if (base_defs[0] && base_defs[1]
+      && base_defs[0]->regno () == base_defs[1]->regno ())
+    {
+      // Accesses see different versions of the same
+      // base register, i.e. the base register gets updated
+      // between the two accesses.
+      //
+      // For now, we punt on this, but TODO: we should try
+      // to use an auto-inc ldp/stp where possible here.
+      if (dump_file)
+	fprintf (dump_file, "punting on base register update (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return;
+    }
+
+  for (int i = 0; i < 2; i++)
+    {
+      if (!base_defs[i])
+	continue;
+
+      // We already know that the offset is such that a valid pair insn
+      // can be formed, but given that we're changing a base, we need to
+      // check that we can safely adjust the mem to get a suitable
+      // paired mem while still satisfying "m".
+      const bool is_lower = (i == reversed);
+      rtx mem = cand_mems[i];
+      rtx other_mem = cand_mems[!i];
+      HOST_WIDE_INT adjust_amt = access_size;
+      if (!is_lower)
+	adjust_amt *= -1;
+      machine_mode this_mode = GET_MODE (mem);
+      machine_mode new_mode = GET_MODE (other_mem);
+      if (!try_adjust_address (mem, new_mode, adjust_amt))
+	{
+	  auto canon_mode = canonical_mode_for_mode (new_mode);
+	  if (canon_mode == new_mode)
+	    {
+	      if (dump_file)
+		fprintf (dump_file,
+			 "insn (%d): base not viable, can't adjust mem by %"
+			 PRId64 " (from %smode -> %smode)\n",
+			 insns[i]->uid (), adjust_amt,
+			 mode_name[this_mode], mode_name[new_mode]);
+	      continue;
+	    }
+
+	  if (!try_adjust_address (mem, canon_mode, adjust_amt))
+	    {
+
+	      if (dump_file)
+		fprintf (dump_file,
+			 "insn (%d): base not viable, can't adjust mem by %"
+			 PRId64 " (%s -> %s) or in canonical %smode\n",
+			 insns[i]->uid (), adjust_amt,
+			 mode_name[this_mode], mode_name[new_mode],
+			 mode_name[canon_mode]);
+	      continue;
+	    }
+
+	  rtx reg_op = lowpart_subreg (canon_mode, reg_ops[!i], new_mode);
+	  if (!reg_op)
+	    {
+	      if (dump_file)
+		fprintf (dump_file,
+			 "insn (%d): base not viable, can only adjust mem "
+			 "in %smode, but reg op can't be punned from %smode\n",
+			 insns[i]->uid (),
+			 mode_name[canon_mode], mode_name[new_mode]);
+	      continue;
+	    }
+
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "insn (%d): can't adjust mem by %" PRId64
+		     " (%s -> %s) but can in (canonical) %smode\n",
+		     insns[i]->uid (), adjust_amt,
+		     mode_name[this_mode], mode_name[new_mode],
+		     mode_name[canon_mode]);
+	}
+
+      base_cands.quick_push (base_cand {base_defs[i], i});
+    }
+}
+
+// Given two adjacent memory accesses of the same size, I1 and I2, try
+// and see if we can merge them into a ldp or stp.
+static bool
+try_fuse_pair (bool load_p,
+	       unsigned access_size,
+	       insn_info *i1,
+	       insn_info *i2,
+	       base_info binfo,
+	       bool &emitted_tombstone_p)
+{
+  if (dump_file)
+    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
+	     load_p, i1->uid (), i2->uid ());
+
+  insn_info *insns[2];
+  bool reversed = false;
+  if (*i1 < *i2)
+    {
+      insns[0] = i1;
+      insns[1] = i2;
+    }
+  else
+    {
+      insns[0] = i2;
+      insns[1] = i1;
+      reversed = true;
+    }
+
+  rtx cand_mems[2];
+  rtx reg_ops[2];
+  for (int i = 0; i < 2; i++)
+    {
+      rtx pat = PATTERN (insns[i]->rtl ());
+      cand_mems[i] = XEXP (pat, load_p);
+      reg_ops[i] = XEXP (pat, !load_p);
+    }
+
+  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "punting on ldp due to reg conflicts (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  if (cfun->can_throw_non_call_exceptions
+      && insn_could_throw_p (insns[0]->rtl ())
+      && insn_could_throw_p (insns[1]->rtl ()))
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "can't combine insns with EH side effects (%d,%d)\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  auto_vec <base_cand> base_cands;
+  base_cands.reserve (2);
+  if (binfo.is_reg ())
+    // Simple case: both accesses using the same base register def.
+    base_cands.quick_push (binfo.get_reg ());
+  else
+    {
+      get_viable_bases (insns, base_cands, cand_mems,
+			reg_ops, access_size, reversed);
+      if (base_cands.is_empty ())
+	return false;
+    }
+
+  for (auto use : insns[1]->uses ())
+    if (!use->is_mem () && use->def () && use->def ()->insn () == insns[0])
+      {
+	if (dump_file)
+	  fprintf (dump_file, "%d has true dependence on %d, rejecting pair\n",
+		   insns[1]->uid (), insns[0]->uid ());
+	return false;
+      }
+
+  unsigned i = 0;
+  while (i < base_cands.length ())
+    {
+      base_cand &cand = base_cands[i];
+
+      rtx *ignore[2] = {};
+      for (int j = 0; j < 2; j++)
+	if (cand.from_insn == -1 || cand.from_insn == !j)
+	  ignore[j] = &XEXP (cand_mems[j], 0);
+
+      insn_info *h = first_hazard_after (insns[0], ignore[0]);
+      if (h && *h <= *insns[1])
+	cand.hazards[0] = h;
+
+      h = latest_hazard_before (insns[1], ignore[1]);
+      if (h && *h >= *insns[0])
+	cand.hazards[1] = h;
+
+      if (!cand.viable ())
+	{
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "pair (%d,%d): rejecting base %d due to dataflow "
+		     "hazards (%d,%d)\n",
+		     insns[0]->uid (),
+		     insns[1]->uid (),
+		     cand.m_def->regno (),
+		     cand.hazards[0]->uid (),
+		     cand.hazards[1]->uid ());
+
+	  base_cands.ordered_remove (i);
+	}
+      else
+	i++;
+    }
+
+  if (base_cands.is_empty ())
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "can't form pair (%d,%d) due to dataflow hazards\n",
+		 insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  insn_info *alias_hazards[4] = {};
+
+  // First def of memory after the first insn, and last def of memory
+  // before the second insn, respectively.
+  def_info *mem_defs[2] = {};
+  if (load_p)
+    {
+      if (!MEM_READONLY_P (cand_mems[0]))
+	{
+	  mem_defs[0] = memory_access (insns[0]->uses ())->def ();
+	  gcc_checking_assert (mem_defs[0]);
+	  mem_defs[0] = mem_defs[0]->next_def ();
+	}
+      if (!MEM_READONLY_P (cand_mems[1]))
+	{
+	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
+	  gcc_checking_assert (mem_defs[1]);
+	}
+    }
+  else
+    {
+      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
+      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
+      gcc_checking_assert (mem_defs[0]);
+      gcc_checking_assert (mem_defs[1]);
+    }
+
+  store_walker<false> forward_store_walker (mem_defs[0],
+					    cand_mems[0],
+					    insns[1]);
+  store_walker<true> backward_store_walker (mem_defs[1],
+					    cand_mems[1],
+					    insns[0]);
+  alias_walker *walkers[4] = {};
+  if (mem_defs[0])
+    walkers[0] = &forward_store_walker;
+  if (mem_defs[1])
+    walkers[1] = &backward_store_walker;
+
+  if (load_p && (mem_defs[0] || mem_defs[1]))
+    do_alias_analysis (alias_hazards, walkers, load_p);
+  else
+    {
+      // We want to find any loads hanging off the first store.
+      mem_defs[0] = memory_access (insns[0]->defs ());
+      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
+      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
+      walkers[2] = &forward_load_walker;
+      walkers[3] = &backward_load_walker;
+      do_alias_analysis (alias_hazards, walkers, load_p);
+      // Now consolidate hazards back down.
+      if (alias_hazards[2]
+	  && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
+	alias_hazards[0] = alias_hazards[2];
+
+      if (alias_hazards[3]
+	  && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
+	alias_hazards[1] = alias_hazards[3];
+    }
+
+  if (alias_hazards[0] && alias_hazards[1]
+      && *alias_hazards[0] <= *alias_hazards[1])
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
+		 i1->uid (), i2->uid (),
+		 alias_hazards[0]->uid (), alias_hazards[1]->uid ());
+      return false;
+    }
+
+  // Now narrow the hazards on each base candidate using
+  // the alias hazards.
+  i = 0;
+  while (i < base_cands.length ())
+    {
+      base_cand &cand = base_cands[i];
+      if (alias_hazards[0] && (!cand.hazards[0]
+			       || *alias_hazards[0] < *cand.hazards[0]))
+	cand.hazards[0] = alias_hazards[0];
+      if (alias_hazards[1] && (!cand.hazards[1]
+			       || *alias_hazards[1] > *cand.hazards[1]))
+	cand.hazards[1] = alias_hazards[1];
+
+      if (cand.viable ())
+	i++;
+      else
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
+				"alias/dataflow hazards (%d,%d)\n",
+				insns[0]->uid (), insns[1]->uid (),
+				cand.m_def->regno (),
+				cand.hazards[0]->uid (),
+				cand.hazards[1]->uid ());
+
+	  base_cands.ordered_remove (i);
+	}
+    }
+
+  if (base_cands.is_empty ())
+    {
+      if (dump_file)
+	fprintf (dump_file,
+		 "cannot form pair (%d,%d) due to alias/dataflow hazards\n",
+		 insns[0]->uid (), insns[1]->uid ());
+
+      return false;
+    }
+
+  base_cand *base = &base_cands[0];
+  if (base_cands.length () > 1)
+    {
+      // If there are still multiple viable bases, it makes sense
+      // to choose one that allows us to reduce register pressure:
+      // for loads this means moving further down, for stores this
+      // means moving further up.
+      gcc_checking_assert (base_cands.length () == 2);
+      const int hazard_i = !load_p;
+      if (base->hazards[hazard_i])
+	{
+	  if (!base_cands[1].hazards[hazard_i])
+	    base = &base_cands[1];
+	  else if (load_p
+		   && *base_cands[1].hazards[hazard_i]
+		      > *(base->hazards[hazard_i]))
+	    base = &base_cands[1];
+	  else if (!load_p
+		   && *base_cands[1].hazards[hazard_i]
+		      < *(base->hazards[hazard_i]))
+	    base = &base_cands[1];
+	}
+    }
+
+  // For the chosen base, we know that hazards[0] (if present) comes
+  // strictly after hazards[1] (if present), so the pair can be formed
+  // anywhere in the open range (hazards[1], hazards[0]).
+  insn_range_info range (insns[0], insns[1]);
+  if (base->hazards[1])
+    range.first = base->hazards[1];
+  if (base->hazards[0])
+    range.last = base->hazards[0]->prev_nondebug_insn ();
+
+  // Placement strategy: push loads down and pull stores up, this should
+  // help register pressure by reducing live ranges.
+  if (load_p)
+    range.first = range.last;
+  else
+    range.last = range.first;
+
+  if (dump_file)
+    {
+      auto print_hazard = [](insn_info *i)
+	{
+	  if (i)
+	    fprintf (dump_file, "%d", i->uid ());
+	  else
+	    fprintf (dump_file, "-");
+	};
+      auto print_pair = [print_hazard](insn_info **i)
+	{
+	  print_hazard (i[0]);
+	  fprintf (dump_file, ",");
+	  print_hazard (i[1]);
+	};
+
+      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
+	      load_p, insns[0]->uid (), insns[1]->uid (),
+	      base->m_def->regno ());
+      print_pair (base->hazards);
+      fprintf (dump_file, "), move_range: (%d,%d)\n",
+	       range.first->uid (), range.last->uid ());
+    }
+
+  return fuse_pair (load_p, i1, i2, *base, range, emitted_tombstone_p);
+}
+
+// Erase [l.begin (), i] inclusive, respecting iterator order.
+static insn_iter_t
+erase_prefix (insn_list_t &l, insn_iter_t i)
+{
+  l.erase (l.begin (), std::next (i));
+  return l.begin ();
+}
+
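+// Erase the insn at iterator I from list L, returning an iterator from
+// which the caller can continue iterating: the following element if I is
+// at BEGIN, otherwise the preceding element.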
+static insn_iter_t
+erase_one (insn_list_t &l, insn_iter_t i, insn_iter_t begin)
+{
+  auto prev_or_next = (i == begin) ? std::next (i) : std::prev (i);
+  l.erase (i);
+  return prev_or_next;
+}
+
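+// Dump the uids of the insns in L to F as a parenthesized,
+// comma-separated list.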
+static void
+dump_insn_list (FILE *f, const insn_list_t &l)
+{
+  fprintf (f, "(");
+
+  auto i = l.begin ();
+  auto end = l.end ();
+
+  if (i != end)
+    {
+      fprintf (f, "%d", (*i)->uid ());
+      i++;
+    }
+
+  for (; i != end; i++)
+    fprintf (f, ", %d", (*i)->uid ());
+
+  fprintf (f, ")");
+}
+
+DEBUG_FUNCTION void
+debug (const insn_list_t &l)
+{
+  dump_insn_list (stderr, l);
+  fprintf (stderr, "\n");
+}
+
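+// Given two lists of candidate insns, LEFT_LIST and RIGHT_LIST, whose
+// accesses are adjacent in offset, walk the lists together and try to
+// fuse one insn from each side into a pair.  [L_BEGIN, L_END) and
+// [R_BEGIN, R_END) give the current iteration ranges over the two lists.
+// Candidates are removed from the lists as they are fused or ruled out,
+// and fused insns from RIGHT_LIST are also recorded in TO_DELETE for the
+// caller.  LOAD_P, ACCESS_SIZE and BINFO describe the candidate accesses,
+// and EMITTED_TOMBSTONE_P is set if fusing a pair emitted a tombstone
+// insn.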
+void
+merge_pairs (insn_iter_t l_begin,
+	     insn_iter_t l_end,
+	     insn_iter_t r_begin,
+	     insn_iter_t r_end,
+	     insn_list_t &left_list,
+	     insn_list_t &right_list,
+	     hash_set <insn_info *> &to_delete,
+	     bool load_p,
+	     unsigned access_size,
+	     base_info binfo,
+	     bool &emitted_tombstone_p)
+{
+  auto iter_l = l_begin;
+  auto iter_r = r_begin;
+
+  bool result;
+  while (l_begin != l_end && r_begin != r_end)
+    {
+      auto next_l = std::next (iter_l);
+      auto next_r = std::next (iter_r);
+      if (**iter_l < **iter_r
+	  && next_l != l_end
+	  && **next_l < **iter_r)
+	{
+	  iter_l = next_l;
+	  continue;
+	}
+      else if (**iter_r < **iter_l
+	       && next_r != r_end
+	       && **next_r < **iter_l)
+	{
+	  iter_r = next_r;
+	  continue;
+	}
+
+      bool update_l = false;
+      bool update_r = false;
+
+      result = try_fuse_pair (load_p, access_size,
+			      *iter_l, *iter_r, binfo,
+			      emitted_tombstone_p);
+      if (result)
+	{
+	  update_l = update_r = true;
+	  if (to_delete.add (*iter_r))
+	    gcc_unreachable (); // Shouldn't get added twice.
+
+	  iter_l = erase_one (left_list, iter_l, l_begin);
+	  iter_r = erase_one (right_list, iter_r, r_begin);
+	}
+      else
+	{
+	  // Here we know that the entire prefix we skipped
+	  // over cannot merge with anything further on
+	  // in iteration order (there are aliasing hazards
+	  // on both sides), so delete the entire prefix.
+	  if (**iter_l < **iter_r)
+	    {
+	      // Delete everything from l_begin to iter_l, inclusive.
+	      update_l = true;
+	      iter_l = erase_prefix (left_list, iter_l);
+	    }
+	  else
+	    {
+	      // Delete everything from r_begin to iter_r, inclusive.
+	      update_r = true;
+	      iter_r = erase_prefix (right_list, iter_r);
+	    }
+	}
+
+      if (update_l)
+	{
+	  l_begin = left_list.begin ();
+	  l_end = left_list.end ();
+	}
+      if (update_r)
+	{
+	  r_begin = right_list.begin ();
+	  r_end = right_list.end ();
+	}
+    }
+}
+
+// Given a list of insns LEFT_ORIG with all accesses adjacent to
+// those in RIGHT_ORIG, try and form them into pairs.
+//
+// Return true iff we formed all the RIGHT_ORIG candidates into
+// pairs.
+bool
+ldp_bb_info::try_form_pairs (insn_list_t *left_orig,
+			     insn_list_t *right_orig,
+			     bool load_p, unsigned access_size,
+			     base_info binfo)
+{
+  // Make a copy of the right list which we can modify to
+  // exclude candidates locally for this invocation.
+  insn_list_t right_copy (*right_orig);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "try_form_pairs [L=%d], cand vecs ", load_p);
+      dump_insn_list (dump_file, *left_orig);
+      fprintf (dump_file, " x ");
+      dump_insn_list (dump_file, right_copy);
+      fprintf (dump_file, "\n");
+    }
+
+  // Set of candidate insns to delete from the original right_list
+  // (because they were formed into a pair).
+  hash_set <insn_info *> to_delete;
+
+  // Now we have a 2D matrix of candidates, traverse it to try and
+  // find a pair of insns that are already adjacent (within the
+  // merged list of accesses).
+  merge_pairs (left_orig->begin (), left_orig->end (),
+	       right_copy.begin (), right_copy.end (),
+	       *left_orig, right_copy,
+	       to_delete, load_p, access_size, binfo,
+	       m_emitted_tombstone);
+
+  // If we formed all right candidates into pairs,
+  // then we can skip the next iteration.
+  if (to_delete.elements () == right_orig->size ())
+    return true;
+
+  // Remove the insns recorded in to_delete from the original right list.
+  auto right_iter = right_orig->begin ();
+  auto right_end = right_orig->end ();
+  while (right_iter != right_end)
+    {
+      auto right_next = std::next (right_iter);
+
+      if (to_delete.contains (*right_iter))
+	{
+	  right_orig->erase (right_iter);
+	  right_end = right_orig->end ();
+	}
+
+      right_iter = right_next;
+    }
+
+  return false;
+}
+
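+// GROUP holds the tracked accesses for a single base (described by BINFO)
+// with a common encoded load/store kind and access size (ENCODED_LFS).
+// Walk the accesses in the group and try to form pairs from candidates
+// whose offsets differ by exactly the access size, skipping the next
+// candidate whenever a call to try_form_pairs consumes all of the
+// right-hand insns.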
+void
+ldp_bb_info::transform_for_base (base_info binfo,
+				 int encoded_lfs,
+				 access_group &group)
+{
+  const auto lfs = decode_lfs (encoded_lfs);
+  const unsigned access_size = lfs.size;
+
+  bool skip_next = true;
+  access_record *prev_access = nullptr;
+
+  for (auto &access : group.list)
+    {
+      if (skip_next)
+	skip_next = false;
+      else if (known_eq (access.offset, prev_access->offset + access_size))
+	skip_next = try_form_pairs (&prev_access->cand_insns,
+				    &access.cand_insns,
+				    lfs.load_p, access_size, binfo);
+
+      prev_access = &access;
+    }
+}
+
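+// Remove any tombstone insns we emitted for this bb.  If a tombstone's
+// memory def still has uses, reparent them to the previous memory def
+// before deleting the insn.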
+void
+ldp_bb_info::cleanup_tombstones ()
+{
+  // No need to do anything if we didn't emit a tombstone insn for this bb.
+  if (!m_emitted_tombstone)
+    return;
+
+  insn_info *insn = m_bb->head_insn ();
+  while (insn)
+    {
+      insn_info *next = insn->next_nondebug_insn ();
+      if (!insn->is_real () || !tombstone_insn_p (insn))
+	{
+	  insn = next;
+	  continue;
+	}
+
+      auto def = memory_access (insn->defs ());
+      auto set = dyn_cast <set_info *> (def);
+      if (set && set->has_any_uses ())
+	{
+	  def_info *prev_def = def->prev_def ();
+	  auto prev_set = dyn_cast <set_info *> (prev_def);
+	  if (!prev_set)
+	    gcc_unreachable (); // TODO: handle this if needed.
+
+	  while (set->first_use ())
+	    crtl->ssa->reparent_use (set->first_use (), prev_set);
+	}
+
+      // Now that the memory def has no uses, we can delete the insn.
+      insn_change change (insn, insn_change::DELETE);
+      crtl->ssa->change_insn (change);
+      insn = next;
+    }
+}
+
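+// For each entry in MAP (keyed on a base and an encoded LFS value), try
+// to transform its access group into pairs.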
+template<typename Map>
+void
+ldp_bb_info::traverse_base_map (Map &map)
+{
+  for (auto kv : map)
+    {
+      const auto &key = kv.first;
+      auto &value = kv.second;
+      const base_info binfo (key.first);
+      transform_for_base (binfo, key.second, value);
+    }
+}
+
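+// Try to form pairs from all of the accesses we tracked for this bb.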
+void
+ldp_bb_info::transform ()
+{
+  traverse_base_map (expr_map);
+  traverse_base_map (def_map);
+}
+
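+// Set up global state for the pass: compute dominance info, run dataflow
+// analysis and build the RTL-SSA form of the current function.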
+static void
+ldp_fusion_init ()
+{
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_analyze ();
+  crtl->ssa = new rtl_ssa::function_info (cfun);
+}
+
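+// Tear down the state built by ldp_fusion_init, applying any RTL-SSA
+// changes that are still pending (and cleaning up the CFG if that
+// requires it).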
+static void
+ldp_fusion_destroy ()
+{
+  if (crtl->ssa->perform_pending_updates ())
+    cleanup_cfg (0);
+
+  free_dominance_info (CDI_DOMINATORS);
+
+  delete crtl->ssa;
+  crtl->ssa = nullptr;
+}
+
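+// Try to form load/store pairs within BB: scan its nondebug insns,
+// tracking the single-set loads and stores that the tuning policy allows
+// us to pair, then attempt the transformation and clean up any tombstone
+// insns we emitted along the way.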
+void
+ldp_fusion_bb (bb_info *bb)
+{
+  const bool track_loads
+    = aarch64_tune_params.ldp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
+  const bool track_stores
+    = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
+
+  ldp_bb_info bb_state (bb);
+
+  for (auto insn : bb->nondebug_insns ())
+    {
+      rtx_insn *rti = insn->rtl ();
+
+      if (!rti || !INSN_P (rti))
+	continue;
+
+      rtx pat = PATTERN (rti);
+
+      if (GET_CODE (pat) != SET)
+	continue;
+
+      if (track_stores && MEM_P (XEXP (pat, 0)))
+	bb_state.track_access (insn, false, XEXP (pat, 0));
+      else if (track_loads && MEM_P (XEXP (pat, 1)))
+	bb_state.track_access (insn, true, XEXP (pat, 1));
+    }
+
+  bb_state.transform ();
+  bb_state.cleanup_tombstones ();
+}
+
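+// Main entry point for the pass: run pair fusion over each basic block
+// in the function.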
+void
+ldp_fusion ()
+{
+  ldp_fusion_init ();
+
+  for (auto bb : crtl->ssa->bbs ())
+    ldp_fusion_bb (bb);
+
+  ldp_fusion_destroy ();
+}
+
+namespace {
+
+const pass_data pass_data_ldp_fusion =
+{
+  RTL_PASS, /* type */
+  "ldp_fusion", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_ldp_fusion : public rtl_opt_pass
+{
+public:
+  pass_ldp_fusion (gcc::context *ctx)
+    : rtl_opt_pass (pass_data_ldp_fusion, ctx)
+    {}
+
+  opt_pass *clone () override { return new pass_ldp_fusion (m_ctxt); }
+
+  bool gate (function *) final override
+    {
+      if (!optimize || optimize_debug)
+	return false;
+
+      // If the tuning policy says never to form ldps or stps, don't run
+      // the pass.
+      if ((aarch64_tune_params.ldp_policy_model
+	   == AARCH64_LDP_STP_POLICY_NEVER)
+	  && (aarch64_tune_params.stp_policy_model
+	      == AARCH64_LDP_STP_POLICY_NEVER))
+	return false;
+
+      if (reload_completed)
+	return flag_aarch64_late_ldp_fusion;
+      else
+	return flag_aarch64_early_ldp_fusion;
+    }
+
+  unsigned execute (function *) final override
+    {
+      ldp_fusion ();
+      return 0;
+    }
+};
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_ldp_fusion (gcc::context *ctx)
+{
+  return new pass_ldp_fusion (ctx);
+}
+
+#include "gt-aarch64-ldp-fusion.h"
diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..f38c642414e 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,5 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion);
+INSERT_PASS_BEFORE (pass_peephole2, 1, pass_ldp_fusion);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 60a55f4bc19..1ccc516622b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1049,6 +1049,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context *);
 rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
 rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
 rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
+rtl_opt_pass *make_pass_ldp_fusion (gcc::context *);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a518202a1..5d63b2ec8ac 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -271,6 +271,16 @@ mtrack-speculation
 Target Var(aarch64_track_speculation)
 Generate code to track when the CPU might be speculating incorrectly.
 
+mearly-ldp-fusion
+Target Var(flag_aarch64_early_ldp_fusion) Optimization Init(1)
+Enable the pre-RA AArch64-specific pass to fuse loads and stores into
+ldp and stp instructions.
+
+mlate-ldp-fusion
+Target Var(flag_aarch64_late_ldp_fusion) Optimization Init(1)
+Enable the post-RA AArch64-specific pass to fuse loads and stores into
+ldp and stp instructions.
+
 mstack-protector-guard=
 Target RejectNegative Joined Enum(stack_protector_guard) Var(aarch64_stack_protector_guard) Init(SSP_GLOBAL)
 Use given stack-protector guard.
@@ -360,3 +370,13 @@ Enum(aarch64_ldp_stp_policy) String(never) Value(AARCH64_LDP_STP_POLICY_NEVER)
 
 EnumValue
 Enum(aarch64_ldp_stp_policy) String(aligned) Value(AARCH64_LDP_STP_POLICY_ALIGNED)
+
+-param=aarch64-ldp-alias-check-limit=
+Target Joined UInteger Var(aarch64_ldp_alias_check_limit) Init(8) IntegerRange(0, 65536) Param
+Limit on number of alias checks performed when attempting to form an ldp/stp.
+
+-param=aarch64-ldp-canonicalize-modes=
+Target Joined UInteger Var(aarch64_ldp_canonicalize_modes) Init(0) IntegerRange(0,1) Param
+Debugging param to change the strategy for adjusting modes when forming load
+and store pairs.  When set to 0, we only ensure modes agree on VECTOR_MODE_P.
+When set to 1, we canonicalize to a single mode for a given size.
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index a9a244ab6d6..37917344a54 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -176,6 +176,13 @@ aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/aarch64/aarch64-cc-fusion.cc
 
+aarch64-ldp-fusion.o: $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc \
+    $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
+    $(RTL_SSA_H) cfgcleanup.h tree-pass.h ordered-hash-map.h tree-dfa.h \
+    fold-const.h tree-hash-traits.h print-tree.h
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/aarch64/aarch64-ldp-fusion.cc
+
 comma=,
 MULTILIB_OPTIONS    = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG))))
 MULTILIB_DIRNAMES   = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))
  

Patch

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index e5de4bbb3f5..f1621c9a384 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -148,7 +148,7 @@  struct ldp_bb_info
   static const size_t obstack_alignment = sizeof (void *);
   bb_info *m_bb;
 
-  ldp_bb_info (bb_info *bb) : m_bb (bb)
+  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
   {
     obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
 				obstack_alignment, obstack_chunk_alloc,
@@ -164,7 +164,10 @@  struct ldp_bb_info
   inline void cleanup_tombstones ();
 
 private:
+  // Did we emit a tombstone insn for this bb?
+  bool m_emitted_tombstone;
   obstack m_obstack;
+
   inline splay_tree_node<access_record *> *node_alloc (access_record *);
 
   template<typename Map>
@@ -1006,7 +1009,8 @@  fuse_pair (bool load_p,
 	   insn_info *i1,
 	   insn_info *i2,
 	   base_cand &base,
-	   const insn_range_info &move_range)
+	   const insn_range_info &move_range,
+	   bool &emitted_tombstone_p)
 {
   auto attempt = crtl->ssa->new_change_attempt ();
 
@@ -1021,6 +1025,9 @@  fuse_pair (bool load_p,
 						    insn_change::DELETE);
     };
 
+  // Are we using a tombstone insn for this pair?
+  bool have_tombstone_p = false;
+
   insn_info *first = (*i1 < *i2) ? i1 : i2;
   insn_info *second = (first == i1) ? i2 : i1;
 
@@ -1217,6 +1224,7 @@  fuse_pair (bool load_p,
 		  gcc_assert (validate_change (rti, &REG_NOTES (rti),
 					       NULL_RTX, true));
 		  change->new_uses = use_array (nullptr, 0);
+		  have_tombstone_p = true;
 		}
 	      gcc_assert (change->new_defs.is_valid ());
 	      gcc_assert (change->new_uses.is_valid ());
@@ -1283,6 +1291,7 @@  fuse_pair (bool load_p,
 
   confirm_change_group ();
   crtl->ssa->change_insns (changes);
+  emitted_tombstone_p |= have_tombstone_p;
   return true;
 }
 
@@ -1702,7 +1711,8 @@  try_fuse_pair (bool load_p,
 	       unsigned access_size,
 	       insn_info *i1,
 	       insn_info *i2,
-	       base_info binfo)
+	       base_info binfo,
+	       bool &emitted_tombstone_p)
 {
   if (dump_file)
     fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
@@ -1991,7 +2001,7 @@  try_fuse_pair (bool load_p,
 	       range.first->uid (), range.last->uid ());
     }
 
-  return fuse_pair (load_p, i1, i2, *base, range);
+  return fuse_pair (load_p, i1, i2, *base, range, emitted_tombstone_p);
 }
 
 // Erase [l.begin (), i] inclusive, respecting iterator order.
@@ -2047,7 +2057,8 @@  merge_pairs (insn_iter_t l_begin,
 	     hash_set <insn_info *> &to_delete,
 	     bool load_p,
 	     unsigned access_size,
-	     base_info binfo)
+	     base_info binfo,
+	     bool &emitted_tombstone_p)
 {
   auto iter_l = l_begin;
   auto iter_r = r_begin;
@@ -2076,7 +2087,8 @@  merge_pairs (insn_iter_t l_begin,
       bool update_r = false;
 
       result = try_fuse_pair (load_p, access_size,
-			      *iter_l, *iter_r, binfo);
+			      *iter_l, *iter_r, binfo,
+			      emitted_tombstone_p);
       if (result)
 	{
 	  update_l = update_r = true;
@@ -2153,7 +2165,8 @@  ldp_bb_info::try_form_pairs (insn_list_t *left_orig,
   merge_pairs (left_orig->begin (), left_orig->end (),
 	       right_copy.begin (), right_copy.end (),
 	       *left_orig, right_copy,
-	       to_delete, load_p, access_size, binfo);
+	       to_delete, load_p, access_size, binfo,
+	       m_emitted_tombstone);
 
   // If we formed all right candidates into pairs,
   // then we can skip the next iteration.
@@ -2206,6 +2219,10 @@  ldp_bb_info::transform_for_base (base_info binfo,
 void
 ldp_bb_info::cleanup_tombstones ()
 {
+  // No need to do anything if we didn't emit a tombstone insn for this bb.
+  if (!m_emitted_tombstone)
+    return;
+
   insn_info *insn = m_bb->head_insn ();
   while (insn)
     {