Home | History | Annotate | Download | only in CodeGen
      1 //===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // The machine combiner pass uses machine trace metrics to ensure the combined
     11 // instructions does not lengthen the critical path or the resource depth.
     12 //===----------------------------------------------------------------------===//
     13 
     14 #define DEBUG_TYPE "machine-combiner"
     15 
     16 #include "llvm/ADT/Statistic.h"
     17 #include "llvm/ADT/DenseMap.h"
     18 #include "llvm/CodeGen/MachineDominators.h"
     19 #include "llvm/CodeGen/MachineFunction.h"
     20 #include "llvm/CodeGen/MachineFunctionPass.h"
     21 #include "llvm/CodeGen/MachineInstrBuilder.h"
     22 #include "llvm/CodeGen/MachineLoopInfo.h"
     23 #include "llvm/CodeGen/MachineRegisterInfo.h"
     24 #include "llvm/CodeGen/MachineTraceMetrics.h"
     25 #include "llvm/CodeGen/Passes.h"
     26 #include "llvm/CodeGen/TargetSchedule.h"
     27 #include "llvm/Support/CommandLine.h"
     28 #include "llvm/Support/Debug.h"
     29 #include "llvm/Support/raw_ostream.h"
     30 #include "llvm/Target/TargetInstrInfo.h"
     31 #include "llvm/Target/TargetRegisterInfo.h"
     32 #include "llvm/Target/TargetSubtargetInfo.h"
     33 
     34 using namespace llvm;
     35 
     36 STATISTIC(NumInstCombined, "Number of machineinst combined");
     37 
namespace {
/// Machine-level instruction combiner pass.
///
/// Uses machine trace metrics to substitute target-provided alternative
/// instruction sequences only when the replacement does not lengthen the
/// critical path and does not increase resource length (see
/// combineInstructions for the driver loop).
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineTraceMetrics *Traces;
  // Minimum-instruction-count trace ensemble; created lazily on first use in
  // combineInstructions and reset per function in runOnMachineFunction.
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  /// True when a new sequence should be substituted without evaluating the
  /// critical path (size optimization, or no machine model available).
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  /// Try all combiner patterns rooted at each instruction of the block.
  bool combineInstructions(MachineBasicBlock *);
  /// Defining instruction of a virtual-register operand, or nullptr.
  MachineInstr *getOperandDef(const MachineOperand &MO);
  /// Depth (in cycles) of the last instruction in \p InsInstrs.
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  /// Latency of \p NewRoot, refined via dependences found in \p BlockTrace.
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  /// Cost test: does replacing Root's sequence with \p InsInstrs keep (or
  /// improve) the critical path, per the objective of \p Pattern?
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern);
  /// Cost test: does the new sequence avoid increasing resource length?
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  /// Map each instruction to its MCSchedClassDesc.
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}
     84 
char MachineCombiner::ID = 0;
// The address of ID is the unique identity of the pass; expose it so other
// code can reference the pass without including this file.
char &llvm::MachineCombinerID = MachineCombiner::ID;

// Register the pass and declare its dependency on MachineTraceMetrics.
INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)
     93 
     94 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
     95   AU.setPreservesCFG();
     96   AU.addPreserved<MachineDominatorTree>();
     97   AU.addPreserved<MachineLoopInfo>();
     98   AU.addRequired<MachineTraceMetrics>();
     99   AU.addPreserved<MachineTraceMetrics>();
    100   MachineFunctionPass::getAnalysisUsage(AU);
    101 }
    102 
    103 MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
    104   MachineInstr *DefInstr = nullptr;
    105   // We need a virtual register definition.
    106   if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    107     DefInstr = MRI->getUniqueVRegDef(MO.getReg());
    108   // PHI's have no depth etc.
    109   if (DefInstr && DefInstr->isPHI())
    110     DefInstr = nullptr;
    111   return DefInstr;
    112 }
    113 
/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  // InstrDepth[i] holds the computed depth of InsInstrs[i].
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is new virtual register not in trace. Its depth was
        // computed on an earlier iteration; the assert enforces that the
        // defining instruction precedes this use within InsInstrs.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        // Operand is defined by an instruction already in the trace; a PHI
        // (or non-virtual) def yields nullptr and contributes depth 0.
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      // An instruction's depth is the max over its operand dependence chains.
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  // NewRoot is by convention the last instruction of the new sequence.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
    172 
/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO
    // NOTE(review): the iterator is advanced once past reg_begin() with no
    // check against reg_end(); if this register has fewer than two operand
    // entries registered in MRI, RI->getParent() dereferences the end
    // iterator. TODO confirm callers guarantee a registered use here.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
      // Dependent use inside the trace: use the precise operand latency.
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      // Otherwise fall back to the instruction's overall latency.
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
    212 
/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
                   // (Selected for the reassociation patterns below.)
  Default          // The critical path must not be lengthened.
};
    219 
    220 static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
    221   // TODO: If C++ ever gets a real enum class, make this part of the
    222   // MachineCombinerPattern class.
    223   switch (P) {
    224   case MachineCombinerPattern::REASSOC_AX_BY:
    225   case MachineCombinerPattern::REASSOC_AX_YB:
    226   case MachineCombinerPattern::REASSOC_XA_BY:
    227   case MachineCombinerPattern::REASSOC_XA_YB:
    228     return CombinerObjective::MustReduceDepth;
    229   default:
    230     return CombinerObjective::Default;
    231   }
    232 }
    233 
/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
///
/// \param MBB basic block containing \p Root
/// \param Root last instruction of the original sequence
/// \param BlockTrace trace through \p MBB
/// \param InsInstrs replacement sequence; its last element is NewRoot
/// \param InstrIdxForVirtReg maps new virtual registers to their defining
/// index in \p InsInstrs (see getDepth)
/// \param Pattern the combiner pattern that produced \p InsInstrs
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];

  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;

  // NOTE(review): this prints the MachineInstr pointer value, not the
  // instruction itself; *Root (or Root->dump()) was probably intended.
  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
        dbgs() << " RootDepth: " << RootDepth << "\n");

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
    return NewRootDepth < RootDepth;

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(Root);

  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootLatency: " << RootLatency << "\n";
        dbgs() << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  // Accept when the new sequence is no worse than the old one plus its slack.
  return NewCycleCount <= OldCycleCount;
}
    287 
    288 /// helper routine to convert instructions into SC
    289 void MachineCombiner::instr2instrSC(
    290     SmallVectorImpl<MachineInstr *> &Instrs,
    291     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
    292   for (auto *InstrPtr : Instrs) {
    293     unsigned Opc = InstrPtr->getOpcode();
    294     unsigned Idx = TII->get(Opc).getSchedClass();
    295     const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    296     InstrsSC.push_back(SC);
    297   }
    298 }
    299 
    300 /// True when the new instructions do not increase resource length
    301 bool MachineCombiner::preservesResourceLen(
    302     MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    303     SmallVectorImpl<MachineInstr *> &InsInstrs,
    304     SmallVectorImpl<MachineInstr *> &DelInstrs) {
    305   if (!TSchedModel.hasInstrSchedModel())
    306     return true;
    307 
    308   // Compute current resource length
    309 
    310   //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
    311   SmallVector <const MachineBasicBlock *, 1> MBBarr;
    312   MBBarr.push_back(MBB);
    313   unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
    314 
    315   // Deal with SC rather than Instructions.
    316   SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
    317   SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
    318 
    319   instr2instrSC(InsInstrs, InsInstrsSC);
    320   instr2instrSC(DelInstrs, DelInstrsSC);
    321 
    322   ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
    323   ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);
    324 
    325   // Compute new resource length.
    326   unsigned ResLenAfterCombine =
    327       BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
    328 
    329   DEBUG(dbgs() << "RESOURCE DATA: \n";
    330         dbgs() << " resource len before: " << ResLenBeforeCombine
    331                << " after: " << ResLenAfterCombine << "\n";);
    332 
    333   return ResLenAfterCombine <= ResLenBeforeCombine;
    334 }
    335 
    336 /// \returns true when new instruction sequence should be generated
    337 /// independent if it lengthens critical path or not
    338 bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
    339   if (OptSize && (NewSize < OldSize))
    340     return true;
    341   if (!TSchedModel.hasInstrSchedModelOrItineraries())
    342     return true;
    343   return false;
    344 }
    345 
/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for codesize always combine when the new
/// sequence is shorter.
///
/// \param MBB block whose instructions are considered as pattern roots
/// \returns true when at least one substitution was performed
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();

  while (BlockIter != MBB->end()) {
    // Advance before any substitution: DelInstrs (which presumably include
    // MI itself) are erased below, which would invalidate the iterator.
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) is met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      // Lazily create the min-instruction-count trace ensemble on first use.
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
      Traces->verifyAnalysis();
      // Ask the target to materialize the alternative sequence for pattern P.
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      // Substitute when we optimize for codesize and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (doSubstitute(NewInstCount, OldInstCount) ||
          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                   InstrIdxForVirtReg, P) &&
           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
        // Splice the new sequence in before MI, then erase the old one.
        for (auto *InstrPtr : InsInstrs)
          MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
        for (auto *InstrPtr : DelInstrs)
          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

        Changed = true;
        ++NumInstCombined;

        // Trace metrics for this block are stale after the rewrite.
        Traces->invalidate(MBB);
        Traces->verifyAnalysis();
        // Eagerly stop after the first pattern fires.
        break;
      } else {
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}
    443 
    444 bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
    445   const TargetSubtargetInfo &STI = MF.getSubtarget();
    446   TII = STI.getInstrInfo();
    447   TRI = STI.getRegisterInfo();
    448   SchedModel = STI.getSchedModel();
    449   TSchedModel.init(SchedModel, &STI, TII);
    450   MRI = &MF.getRegInfo();
    451   Traces = &getAnalysis<MachineTraceMetrics>();
    452   MinInstr = nullptr;
    453   OptSize = MF.getFunction()->optForSize();
    454 
    455   DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
    456   if (!TII->useMachineCombiner()) {
    457     DEBUG(dbgs() << "  Skipping pass: Target does not support machine combiner\n");
    458     return false;
    459   }
    460 
    461   bool Changed = false;
    462 
    463   // Try to combine instructions.
    464   for (auto &MBB : MF)
    465     Changed |= combineInstructions(&MBB);
    466 
    467   return Changed;
    468 }
    469