//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map from virtual register to the index
/// of its defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of the last instruction in \p InsInstrs ("NewRoot")
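///
/// For illustration only (schematic, made-up latencies): if \p InsInstrs is
/// {I0, I1} and I1 uses a register defined by I0, then
///   depth(I1) = max over I1's use operands of (depth(def) + operand latency),
/// so with depth(I0) = 2 and an I0->I1 operand latency of 3, depth(I1) = 5.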
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register, not yet in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as the max of the latencies of its defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace).
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
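///
/// For illustration only (schematic): for each register that NewRoot defines,
/// the latency is the def-to-use latency to the first user when that user is
/// a dependence inside the trace, and the generic instruction latency of
/// NewRoot otherwise; the result is the max over all such definitions.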
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO. The use/def list starts at the
    // definition itself, which RI++ steps past.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}
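
// For illustration only (schematic): the REASSOC_* patterns rewrite a serial
// chain such as
//   T = A op X ; Root = T op Y
// into the reassociated form
//   T = X op Y ; Root = A op T
// Instruction count and resource usage are unchanged, so the only possible
// win is a shorter data dependency chain; hence MustReduceDepth.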

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
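///
/// For the default objective the check below is, schematically,
///   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack
/// i.e. the new sequence may even be deeper, as long as the extra cycles fit
/// into the slack the original Root had on the critical path.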
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];

  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
        dbgs() << " RootDepth: " << RootDepth << "\n");

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
    return NewRootDepth < RootDepth;

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);

  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootLatency: " << RootLatency << "\n";
        dbgs() << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
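/// (Schematically: the resource length of a trace is the number of cycles
/// needed on its most contended processor resource, so the check below is
///   ResourceLen(Trace - DelInstrs + InsInstrs) <= ResourceLen(Trace).)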
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with scheduling classes (SC) rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size, it can unfortunately hurt performance
    // when the ADD is on the critical path but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency, as sketched below.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met, the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.
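
    // For illustration only (schematic, made-up latencies): with an ADD
    // latency of 1 and a MADD latency of 4, replacing
    //   t = MUL a, b ; r = ADD t, c
    // by
    //   r = MADD a, b, c
    // moves the MUL onto the path from c to r, so the c-to-r chain grows from
    // 1 cycle to 4; the checks below reject the combine in that situation
    // unless the root has enough slack.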

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
      Traces->verifyAnalysis();
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found a pattern, but did not generate an alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions, OR when the new sequence neither lengthens the
      // critical path nor increases resource pressure.
      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                   InstrIdxForVirtReg, P) &&
           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
        for (auto *InstrPtr : InsInstrs)
          MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
        for (auto *InstrPtr : DelInstrs)
          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

        Changed = true;
        ++NumInstCombined;

        Traces->invalidate(MBB);
        Traces->verifyAnalysis();
        // Eagerly stop after the first pattern fires.
        break;
      } else {
        // Clean up the instructions of the alternative code sequence. There
        // is no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs() << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}
    478