//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//
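
// In outline (summarizing the cost checks implemented in this file): a
// candidate substitution whose new sequence ends in instruction NewRoot may
// replace the sequence ending in instruction Root only when
//   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack
// and the new sequence does not increase the trace's resource length.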

#define DEBUG_TYPE "machine-combiner"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

STATISTIC(NumInstCombined, "Number of machine instructions combined");

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  const char *getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}
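
// A small worked example for the depth computation below (illustrative
// values, not from the original source): suppose InsInstrs = {I0, I1},
// where I1 ("NewRoot") uses one register defined by I0 (operand latency 1)
// and one register defined in the trace at depth 3 (operand latency 2).
// Then depth(I0) = 0 and depth(I1) = max(0 + 1, 3 + 2) = 5, which is the
// value getDepth() returns.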

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register, so it is not yet in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        LatencyOp = TSchedModel.computeOperandLatency(
            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as the maximum latency of the defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
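
// For example (illustrative values, not from the original source): if
// NewRoot's only definition is consumed by an in-trace user with a
// def-to-use operand latency of 3 cycles, getLatency() returns 3; if no
// in-trace user exists, the full instruction latency of NewRoot (say, 4
// cycles) is used as the conservative fallback.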

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the
/// new sequence to replace the old sequence is that it cannot lengthen the
/// critical path. The definition of "improve" may be restricted by
/// specifying that the new path improves the data dependency chain
/// (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // NewRoot is the last instruction in the \p InsInstrs vector.
  unsigned NewRootIdx = InsInstrs.size() - 1;
  MachineInstr *NewRoot = InsInstrs[NewRootIdx];

  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
        dbgs() << " RootDepth: " << RootDepth << "\n");

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
    return NewRootDepth < RootDepth;

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
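  //
  // For example (hypothetical cycle counts): with NewRootDepth = 6 and
  // NewRootLatency = 4, NewCycleCount = 10; with RootDepth = 5,
  // RootLatency = 3 and RootSlack = 2, OldCycleCount = 10. Since 10 <= 10,
  // the transform is accepted even though the new depth (6) is worse than
  // the old one (5).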
  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);

  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
        dbgs() << " RootLatency: " << RootLatency << "\n";
        dbgs() << " RootSlack: " << RootSlack << "\n";
        dbgs() << " NewRootDepth + NewRootLatency = "
               << NewRootDepth + NewRootLatency << "\n";
        dbgs() << " RootDepth + RootLatency + RootSlack = "
               << RootDepth + RootLatency + RootSlack << "\n";);

  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  DEBUG(dbgs() << "RESOURCE DATA: \n";
        dbgs() << " resource len before: " << ResLenBeforeCombine
               << " after: " << ResLenAfterCombine << "\n";);

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when a new instruction sequence should be generated
/// independently of whether it lengthens the critical path.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}
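
// For instance, when optimizing for code size, replacing the two-instruction
// MUL + ADD sequence with a single MADD gives NewSize = 1 < OldSize = 2, so
// doSubstitute() returns true and the substitution is taken without
// consulting the latency and resource checks above.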

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  auto BlockIter = MBB->begin();
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;

    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this
    // is usually beneficial for code size, it can unfortunately hurt
    // performance when the ADD is on the critical path but the MUL is not.
    // With the substitution the MUL becomes part of the critical path (in
    // the form of the MADD) and can lengthen it on architectures where the
    // MADD latency is longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // the critical path, don't use more resources) are met, the new sequence
    // gets hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.

    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      if (!MinInstr)
        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
      Traces->verifyAnalysis();
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate an alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions, OR when the new sequence neither lengthens the
      // critical path nor increases resource pressure.
      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                   InstrIdxForVirtReg, P) &&
           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
        for (auto *InstrPtr : InsInstrs)
          MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
        for (auto *InstrPtr : DelInstrs)
          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

        Changed = true;
        ++NumInstCombined;

        Traces->invalidate(MBB);
        Traces->verifyAnalysis();
        // Eagerly stop after the first pattern fires.
        break;
      } else {
        // Clean up the instructions of the alternative code sequence. There
        // is no use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  return Changed;
}
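
// Illustrative trace of the motivating substitution above (hypothetical
// virtual register numbers):
//   %3 = MUL %1, %2
//   %5 = ADD %3, %4               <-- Root, matched by a combiner pattern
// genAlternativeCodeSequence() then proposes
//   %5 = MADD %1, %2, %4          <-- NewRoot (last entry of InsInstrs)
// with DelInstrs = {MUL, ADD}; the new sequence is hooked up only when the
// cost checks above accept it, and is otherwise deleted again.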

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  SchedModel = STI.getSchedModel();
  TSchedModel.init(SchedModel, &STI, TII);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction()->optForSize();

  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    DEBUG(dbgs()
          << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}
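
// Note (assumption based on common usage, not part of the original file):
// targets opt in by returning true from TargetInstrInfo::useMachineCombiner()
// and typically schedule this pass in their TargetPassConfig via
// addPass(&MachineCombinerID).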