//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled operations");
STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};
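
// Illustrative example (not in the original source): when a sign-extending
// load such as "ldrsw x1, [x0]" is paired with "ldr w2, [x0, #4]", the pair
// is formed with the non-sign-extending opcode (here LDPWi), and SExtIdx
// records which ldp result (0 or 1) must be re-widened with SBFMXri
// afterwards; see mergePairedInsns below.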

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  // Check whether converting two narrow loads into a single wider load
  // with bitfield extracts should be enabled for this function.
  bool enableNarrowLdMerge(MachineFunction &Fn);

  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURBBi:
  case AArch64::STURHHi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
    return true;
  }
}

static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}
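
// For reference (illustrative example, not from the original source): the
// scaled forms encode their immediate in units of the access size, so
// "ldr x0, [x1, #16]" (LDRXui) carries an immediate operand of 2, whereas
// the unscaled forms above carry a raw signed byte offset, so
// "ldur x0, [x1, #3]" (LDURXi) carries 3.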

static unsigned getBitExtrOpcode(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode.");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return AArch64::UBFMWri;
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
    return AArch64::SBFMWri;
  }
}

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

static bool isNarrowStore(MachineInstr *MI) {
  return isNarrowStore(MI->getOpcode());
}

static bool isNarrowLoad(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
    return true;
  }
}

static bool isNarrowLoad(MachineInstr *MI) {
  return isNarrowLoad(MI->getOpcode());
}

// Scaling factor (i.e., the memory access size in bytes) for a load or store.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}
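
// For example, getMemScale returns 4 for LDRWui and 8 for LDPXi; callers use
// it both as the stride between pairable unscaled offsets and as the divisor
// when rescaling byte offsets into pair-instruction immediates.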

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSBWui:
    return AArch64::LDRBBui;
  case AArch64::LDRSHWui:
    return AArch64::LDRHHui;
  case AArch64::LDURSBWi:
    return AArch64::LDURBBi;
  case AArch64::LDURSHWi:
    return AArch64::LDURHHi;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
  case AArch64::LDURSHWi:
    return AArch64::LDURWi;
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
    return AArch64::LDRHHui;
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
    return AArch64::LDURHHi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}
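
// The accessors above assume the standard operand order of these
// instructions, e.g. (illustrative):
//   ldr x0, [x1, #8]      -> operands (Rt=x0, base=x1, offset)
//   ldp x0, x1, [x2, #16] -> operands (Rt=x0, Rt2=x1, base=x2, offset)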

// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that its flags remain compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }
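  // For example (illustrative): if I is "ldr x1, [x0, #8]" and Paired is
  // "ldr x2, [x0]", Paired has the lower address, so RtMI = Paired and the
  // merge produces "ldp x2, x1, [x0]".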

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isNarrowLoad(Opc)) {
    // Rescale the scaled offset from the narrow access size to the wide one.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // When merging small (< 32 bit) loads for big-endian targets, the order of
    // the component parts gets swapped.
    if (!Subtarget->isLittleEndian())
      std::swap(RtMI, Rt2MI);
    // Construct the new load instruction.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "  with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    int Width = getMemScale(I) == 1 ? 8 : 16;
    int LSBLow = 0;
    int LSBHigh = Width;
    int ImmsLow = LSBLow + Width - 1;
    int ImmsHigh = LSBHigh + Width - 1;
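    // Worked example (illustrative): merging two 16-bit loads gives
    // Width = 16, so LSBLow = 0, LSBHigh = 16, ImmsLow = 15, ImmsHigh = 31;
    // the high half is then recovered below with a bitfield extract of
    // bits 31:16 (e.g. ubfm wDst, wWide, #16, #31).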
    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
      // Create the bitfield extract for high bits.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }
    } else {
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }

      // Create the bitfield extract for high bits.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
    }
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  if (isNarrowStore(Opc)) {
    // Rescale the scaled offset from the narrow access size to the wide one.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(I))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
    // Copy MachineMemOperands from the original stores.
    concatenateMemOperands(MIB, I, Paired);
  } else {
    // Unscaled instructions carry byte offsets; convert to the element
    // offset expected by the paired instruction.
    if (IsUnscaled)
      OffsetImm /= OffsetStride;
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(RtMI))
              .addOperand(getLdStRegOp(Rt2MI))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
  }

  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  //        instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte offset used by unscaled instructions into the "element"
  // offset used by the scaled pair load/store instructions.
  if (IsUnscaled)
    Offset /= OffsetStride;

  return Offset <= 63 && Offset >= -64;
}
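
// For example, with X-register accesses (OffsetStride == 8), unscaled byte
// offsets from -512 to 504 map onto the element range [-64, 63] accepted by
// the paired instructions.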

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
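// For example, alignTo(5, 4) == 8 and alignTo(8, 4) == 8.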

static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
                     const AArch64InstrInfo *TII) {
  // One of the instructions must modify memory.
  if (!MIa->mayStore() && !MIb->mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
    return false;

  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}

static bool mayAlias(MachineInstr *MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     const AArch64InstrInfo *TII) {
  for (auto &MIb : MemInsns)
    if (mayAlias(MIa, MIb, TII))
      return true;

  return false;
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  unsigned Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  bool IsNarrowStore = isNarrowStore(Opc);

  // For narrow stores, find only the case where the stored value is WZR.
  if (IsNarrowStore && Reg != AArch64::WZR)
    return E;

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    Flags.setSExtIdx(-1);
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      assert(IsValidLdStrOpc &&
             "Given Opc should be a Load or Store with an immediate");
      // Opc will be the first instruction in the pair.
      Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
      assert(MI->mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdSt(MI);
        bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
        if (!IsNarrowLoad &&
            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        if (IsNarrowLoad || IsNarrowStore) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow
          // input, bail and keep looking.
          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        } else {
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        // For narrow stores, allow only when the stored value is the same
        // (i.e., WZR).
        if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
            (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, TII)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, TII)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI->isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI->mayLoadOrStore())
      MemInsns.push_back(MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load; if that happens to be
  // the add/sub instruction we're merging, skip one further.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
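
// For example (illustrative), "ldr x0, [x2]" followed by "add x2, x2, #8"
// becomes the post-indexed "ldr x0, [x2], #8"; for a paired instruction the
// written-back amount is encoded scaled, so "ldp x0, x1, [x2]" plus
// "add x2, x2, #16" yields an LDPXpost with immediate operand 2.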

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
                                               MachineInstr *MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI->getOperand(0).getReg() != BaseReg ||
        MI->getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI->getOperand(2).getImm();
    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor.  The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 63 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == MI->getOperand(2).getImm())
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities.  Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile load/store, don't mess with it.
  if (MI->hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm (as opposed to an address reloc).
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (TII->isLdStPairSuppressed(MI))
    return false;

  // Look ahead up to ScanLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    if (isNarrowLoad(MI)) {
      ++NumNarrowLoadsPromoted;
    } else if (isNarrowStore(MI)) {
      ++NumZeroStoresPromoted;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool enableNarrowLdOpt) {
  bool Modified = false;
  // Three transformations to do here:
   1310   // 1) Find narrow loads that can be converted into a single wider load
   1311   //    with bitfield extract instructions.
   1312   //      e.g.,
   1313   //        ldrh w0, [x2]
   1314   //        ldrh w1, [x2, #2]
   1315   //        ; becomes
   1316   //        ldr w0, [x2]
   1317   //        ubfx w1, w0, #16, #16
   1318   //        and w0, w0, #ffff
   1319   // 2) Find loads and stores that can be merged into a single load or store
   1320   //    pair instruction.
   1321   //      e.g.,
   1322   //        ldr x0, [x2]
   1323   //        ldr x1, [x2, #8]
   1324   //        ; becomes
   1325   //        ldp x0, x1, [x2]
   1326   // 3) Find base register updates that can be merged into the load or store
   1327   //    as a base-reg writeback.
   1328   //      e.g.,
   1329   //        ldr x0, [x2]
   1330   //        add x2, x2, #4
   1331   //        ; becomes
   1332   //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       enableNarrowLdOpt && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRBBui:
    case AArch64::LDRHHui:
    case AArch64::LDRSBWui:
    case AArch64::LDRSHWui:
    case AArch64::STRBBui:
    case AArch64::STRHHui:
    // Unscaled instructions.
    case AArch64::LDURBBi:
    case AArch64::LDURHHi:
    case AArch64::LDURSBWi:
    case AArch64::LDURSHWi:
    case AArch64::STURBBi:
    case AArch64::STURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      // ldr x0, [x20]
      // add x20, x20, #32
      //   merged into:
      // ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Unscaled instructions don't have pre/post-index variants, so there is
      // nothing further to fold; move to the next instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
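      // (Illustrative: for the 8-byte LDRXui "ldr x1, [x0, #64]", the encoded
      // immediate is 8 and getMemScale returns 8, so UnscaledOffset is the
      // byte offset 64 that a matching "add x0, x0, #64" would use.)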

      // Look forward to try to find an update instruction we can fold in as a
      // pre-index write-back. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
  bool ProfitableArch = Subtarget->isCortexA57();
  // FIXME: The benefit from converting narrow loads into a wider load is
  // likely microarchitecture-dependent, as it assumes that a single load with
  // two bitfield extracts is cheaper than two narrow loads. Currently, this
  // conversion is enabled only on Cortex-A57, where the performance benefit
  // was verified.
  return ProfitableArch && !Subtarget->requiresStrictAlign();
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();

  bool Modified = false;
  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}
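
// Usage sketch (illustrative; the actual hook lives in the target's pass
// configuration, not in this file): the pass is scheduled after register
// allocation when optimizing, roughly:
//
//   if (TM->getOptLevel() != CodeGenOpt::None)
//     addPass(createAArch64LoadStoreOptimizationPass());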