//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled load/stores");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// Placeholder while testing unscaled load/store combining.
static cl::opt<bool> EnableAArch64UnscaledMemOp(
    "aarch64-unscaled-mem-op", cl::Hidden,
    cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));

namespace {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  // \p SExtIdx[out] gives the index of the result of the load pair that
  // must be extended. The value of SExtIdx assumes that the paired load
  // produces the value in this order: (I, returned iterator), i.e.,
  // -1 means no value has to be extended, 0 means I, and 1 means the
  // returned iterator.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               bool &MergeForward, int &SExtIdx,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  // \p SExtIdx index of the result that must be extended for a paired load.
  // -1 means none, 0 means I, and 1 means Paired.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired, bool MergeForward,
                   int SExtIdx);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int Value);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Merge a pre-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator Update);

  // Merge a post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
                         MachineBasicBlock::iterator Update);

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return "AArch64 load / store optimization pass";
  }

private:
  int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

static bool isUnscaledLdst(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
    return true;
  }
}
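
// For reference (AArch64 addressing): the scaled LDR*ui/STR*ui forms take a
// 12-bit unsigned immediate in units of the access size, while the unscaled
// LDUR*/STUR* forms take a 9-bit signed byte offset. E.g., for 8-byte accesses:
//   ldr  x0, [x1, #32]   ; scaled: encoded imm = 32 / 8 = 4
//   ldur x0, [x1, #-16]  ; unscaled: encoded imm = -16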

// Size in bytes of the data moved by a (scaled or unscaled) load or store.
int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
  switch (MemMI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown size!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return 4;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return 8;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return 16;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return 4;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return 8;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return 16;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return 4;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return 8;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return 4;
  }
}

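// For reference: a sign-extending load can pair with a plain load of the same
// width. E.g., LDRSWui and LDRWui both map to LDRWui below; the pair is then
// emitted as LDPWi, and the sign extension is re-materialized afterwards with
// an SBFMXri (see mergePairedInsns).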
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      bool MergeForward, int SExtIdx) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Whichever way we merge, it will invalidate the
  // iterator, and we don't need to scan the new instruction, as it's a
  // pairwise instruction, which we're not considering for further action
  // anyway.
  if (NextI == Paired)
    ++NextI;

  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdst(Opc);
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;

  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that its flags stay compatible with the input code.
  MachineOperand &BaseRegOp =
      MergeForward ? Paired->getOperand(1) : I->getOperand(1);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (I->getOperand(2).getImm() ==
      Paired->getOperand(2).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }
  // If the inputs are unscaled, convert the byte offset into the element
  // offset expected by the scaled pairwise instruction.
  int OffsetImm = RtMI->getOperand(2).getImm();
  if (IsUnscaled && EnableAArch64UnscaledMemOp)
    OffsetImm /= OffsetStride;
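  // E.g., two LDURXi loads at byte offsets 8 and 16 (OffsetStride == 8)
  // become a single LDPXi with element offset 8 / 8 == 1.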

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(RtMI->getOperand(0))
                                .addOperand(Rt2MI->getOperand(0))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  //        instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
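/// E.g., for "add w0, w1, w2" this sets w0 and its aliases (such as x0) in
/// ModifiedRegs, and w1/w2 and their aliases in UsedRegs.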
static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

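// For reference: pairwise loads/stores take a 7-bit signed element offset,
// so e.g. for 8-byte accesses the reachable byte range is [-512, 504] in
// multiples of 8.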
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  if (!IsUnscaled && (Offset > 63 || Offset < -64))
    return false;
  if (IsUnscaled) {
    // Convert the byte-offset used by unscaled into an "element" offset used
    // by the scaled pair load/store instructions.
    int ElemOffset = Offset / OffsetStride;
    if (ElemOffset > 63 || ElemOffset < -64)
      return false;
  }
  return true;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
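// E.g., alignTo(30, 8) == 32 and alignTo(-30, 8) == -24; the result is
// always rounded up toward positive infinity.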
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      bool &MergeForward, int &SExtIdx,
                                      unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  int Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdst(Opc);
  unsigned Reg = FirstMI->getOperand(0).getReg();
  unsigned BaseReg = FirstMI->getOperand(1).getReg();
  int Offset = FirstMI->getOperand(2).getImm();

  // Early exit if the first instruction modifies the base register,
  // e.g., ldr x0, [x0].
  // Also early exit if the offset is not possible to match: 6 bits of
  // positive range, plus allow an extra one in case we find a later insn
  // that matches with Offset-1.
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    SExtIdx = -1;
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      if (!IsValidLdStrOpc)
        continue;
      // Opc is the opcode of the first instruction in the prospective pair.
      // If it is already the non-sext opcode, the sign-extending load must
      // be the second instruction (index 1); otherwise it is the first
      // (index 0).
      SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0;
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && MI->getOperand(2).isImm()) {
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = MI->getOperand(1).getReg();
      int MIOffset = MI->getOperand(2).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the alignment requirements of the paired (scaled) instruction
        // can't express the offset of the unscaled input, bail and keep
        // looking.
        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
            (alignTo(MinOffset, OffsetStride) != MinOffset)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions, we can combine the second into the first.
        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
            !UsedRegs[MI->getOperand(0).getReg()]) {
          MergeForward = false;
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions, we can combine the first into the
        // second.
        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
          MergeForward = true;
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, but does (or can)
    // modify memory, stop searching, as we don't have alias analysis or
    // anything like that to tell us whether the access is tromping on the
    // locations we care about. The big one we want to catch is calls.
    //
    // FIXME: Theoretically, we can do better than that for SP and FP based
    // references since we can effectively know where those are touching. It's
    // unclear if it's worth the extra code, though. Most paired instructions
    // will be sequential, perhaps with a few intervening non-memory related
    // instructions.
    if (MI->mayStore() || MI->isCall())
      return E;
    // Likewise, if we're matching a store instruction, we don't want to
    // move across a load, as it may be reading the same location.
    if (FirstMI->mayStore() && MI->mayLoad())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;
  }
  return E;
}

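// Writeback addressing semantics, for reference:
//   ldr x0, [x1, #8]!  ; pre-index:  x1 += 8 first, then load from [x1]
//   ldr x0, [x1], #8   ; post-index: load from [x1] first, then x1 += 8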
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating pre-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

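// E.g., with BaseReg == x2 and a requested Offset of 8:
//   add x2, x2, #8          ; matches
//   sub x2, x2, #8          ; no match (value is -8 after negation)
//   add x2, x2, #8, lsl #12 ; no match (shifted immediate)
//   add x3, x2, #8          ; no match (destination differs from base)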
static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
                                 int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;
    // If the instruction has the base register as source and dest and the
    // immediate will fit in a signed 9-bit integer, then we have a match.
    if (MI->getOperand(0).getReg() == BaseReg &&
        MI->getOperand(1).getReg() == BaseReg &&
        MI->getOperand(2).getImm() <= 255 &&
        MI->getOperand(2).getImm() >= -256) {
      // If we have a non-zero Offset, we check that it matches the amount
      // we're adding to the register.
      if (!Offset || Offset == MI->getOperand(2).getImm())
        return true;
    }
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int Value) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm() *
               TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Scan forward looking for post-index opportunities.
  // Updating instructions can't be formed if the memory insn already
  // has an offset other than the value we're looking for.
  if (Offset != Value)
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, Value))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm();
  unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;
  // Two transformations to do here:
  // 1) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 2) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Do the unscaled versions as well.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      // If this is a volatile load/store, don't mess with it.
      if (MI->hasOrderedMemoryRef()) {
        ++MBBI;
        break;
      }
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Check if this load/store has a hint to avoid pair formation.
      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
      if (TII->isLdStPairSuppressed(MI)) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a pairable instruction.
      bool MergeForward = false;
      int SExtIdx = -1;
      MachineBasicBlock::iterator Paired =
          findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit);
      if (Paired != E) {
        // Record this before merging: the merge erases MI, so reading its
        // opcode afterwards would touch freed memory.
        bool IsUnscaled = isUnscaledLdst(MI->getOpcode());
        // Merge the loads into a pair. Keeping the iterator straight is a
        // pain, so we let the merge routine tell us what the next instruction
        // is after it's done mucking about.
        MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx);

        Modified = true;
        ++NumPairCreated;
        if (IsUnscaled)
          ++NumUnscaledPairCreated;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    int Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // Do the unscaled versions as well.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a mergeable instruction.
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePostIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // We don't have pre-/post-indexed mappings for the unscaled opcodes,
      // so move to the next instruction.
      if (isUnscaledLdst(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Look forward for an add/sub of the base register whose immediate
      // matches the memory operand's byte offset; such an update can also
      // be folded as a pre-index writeback. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!

      // The immediate in the load/store is scaled by the size of the register
      // being loaded. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int Value = MI->getOperand(2).getImm() *
                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
                      ->getSize();
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
  TRI = Fn.getSubtarget().getRegisterInfo();

  bool Modified = false;
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}