1 //===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 /// \file 9 //==-----------------------------------------------------------------------===// 10 11 #include "AMDGPU.h" 12 #include "AMDGPUInstrInfo.h" 13 #include "AMDGPUSubtarget.h" 14 #include "R600InstrInfo.h" 15 #include "llvm/ADT/DepthFirstIterator.h" 16 #include "llvm/ADT/SCCIterator.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/CodeGen/MachineDominators.h" 20 #include "llvm/CodeGen/MachineFunction.h" 21 #include "llvm/CodeGen/MachineFunctionAnalysis.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/CodeGen/MachineInstrBuilder.h" 24 #include "llvm/CodeGen/MachineJumpTableInfo.h" 25 #include "llvm/CodeGen/MachineLoopInfo.h" 26 #include "llvm/CodeGen/MachinePostDominators.h" 27 #include "llvm/CodeGen/MachineRegisterInfo.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/Support/Debug.h" 30 #include "llvm/Support/raw_ostream.h" 31 #include "llvm/Target/TargetInstrInfo.h" 32 #include "llvm/Target/TargetMachine.h" 33 #include <deque> 34 35 using namespace llvm; 36 37 #define DEBUG_TYPE "structcfg" 38 39 #define DEFAULT_VEC_SLOTS 8 40 41 // TODO: move-begin. 42 43 //===----------------------------------------------------------------------===// 44 // 45 // Statistics for CFGStructurizer. 
46 // 47 //===----------------------------------------------------------------------===// 48 49 STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " 50 "matched"); 51 STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " 52 "matched"); 53 STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " 54 "pattern matched"); 55 STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); 56 STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); 57 58 namespace llvm { 59 void initializeAMDGPUCFGStructurizerPass(PassRegistry&); 60 } 61 62 //===----------------------------------------------------------------------===// 63 // 64 // Miscellaneous utility for CFGStructurizer. 65 // 66 //===----------------------------------------------------------------------===// 67 namespace { 68 #define SHOWNEWINSTR(i) \ 69 DEBUG(dbgs() << "New instr: " << *i << "\n"); 70 71 #define SHOWNEWBLK(b, msg) \ 72 DEBUG( \ 73 dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ 74 dbgs() << "\n"; \ 75 ); 76 77 #define SHOWBLK_DETAIL(b, msg) \ 78 DEBUG( \ 79 if (b) { \ 80 dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ 81 b->print(dbgs()); \ 82 dbgs() << "\n"; \ 83 } \ 84 ); 85 86 #define INVALIDSCCNUM -1 87 88 template<class NodeT> 89 void ReverseVector(SmallVectorImpl<NodeT *> &Src) { 90 size_t sz = Src.size(); 91 for (size_t i = 0; i < sz/2; ++i) { 92 NodeT *t = Src[i]; 93 Src[i] = Src[sz - i - 1]; 94 Src[sz - i - 1] = t; 95 } 96 } 97 98 } // end anonymous namespace 99 100 //===----------------------------------------------------------------------===// 101 // 102 // supporting data structure for CFGStructurizer 103 // 104 //===----------------------------------------------------------------------===// 105 106 107 namespace { 108 109 class BlockInformation { 110 public: 111 bool IsRetired; 112 int SccNum; 113 BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} 114 }; 115 116 } // end 
//===----------------------------------------------------------------------===//
//
// CFGStructurizer
//
//===----------------------------------------------------------------------===//

namespace {

/// Machine-function pass that rewrites an arbitrary (reducible) CFG into
/// structured control flow (if/else, loop, break, continue constructs) that
/// the R600 control-flow instructions can express.
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;

  /// Classification used by singlePathTo().
  enum PathToKind {
    Not_SinglePath = 0,       // src does not reach dst through a unique chain
    SinglePath_InPath = 1,    // unique successor chain from src reaches dst
    SinglePath_NotInPath = 2  // unique chain ends (no successors) before dst
  };

  static char ID;

  AMDGPUCFGStructurizer() :
      MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
  }

  const char *getPassName() const override {
    return "AMDGPU Control Flow Graph structurizer Pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<MachineFunctionAnalysis>();
    AU.addRequired<MachineFunctionAnalysis>();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addRequired<MachineLoopInfo>();
  }

  /// Perform the CFG structurization
  bool run();

  /// Perform the CFG preparation
  /// This step will remove every unconditionnal/dead jump instructions and make
  /// sure all loops have an exit block
  bool prepare();

  bool runOnMachineFunction(MachineFunction &MF) override {
    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
    TRI = &TII->getRegisterInfo();
    DEBUG(MF.dump(););
    OrderedBlks.clear();
    Visited.clear();
    FuncRep = &MF;
    MLI = &getAnalysis<MachineLoopInfo>();
    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
    MDT = &getAnalysis<MachineDominatorTree>();
    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
    PDT = &getAnalysis<MachinePostDominatorTree>();
    DEBUG(PDT->print(dbgs()););
    // prepare() normalizes the CFG (dead jumps removed, loops get exits),
    // then run() performs the actual pattern-match driven structurization.
    prepare();
    run();
    DEBUG(MF.dump(););
    return true;
  }

protected:
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;
  MachineLoopInfo *MLI;
  const R600InstrInfo *TII;
  const AMDGPURegisterInfo *TRI;

  // PRINT FUNCTIONS
  /// Print the ordered Blocks.
  void printOrderedBlocks() const {
    size_t i = 0;
    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
      dbgs() << "BB" << (*iterBlk)->getNumber();
      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
      // Break the listing into rows of ten blocks for readability.
      if (i != 0 && i % 10 == 0) {
        dbgs() << "\n";
      } else {
        dbgs() << " ";
      }
    }
  }
  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
    for (MachineLoop::iterator iter = LoopInfo.begin(),
        iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
      (*iter)->print(dbgs(), 0);
    }
  }

  // UTILITY FUNCTIONS
  int getSCCNum(MachineBasicBlock *MBB) const;
  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
  bool hasBackEdge(MachineBasicBlock *MBB) const;
  static unsigned getLoopDepth(MachineLoop *LoopRep);
  bool isRetiredBlock(MachineBasicBlock *MBB) const;
  bool isActiveLoophead(MachineBasicBlock *MBB) const;
  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
      bool AllowSideEntry = true) const;
  int countActiveBlock(MBBVector::const_iterator It,
      MBBVector::const_iterator E) const;
  bool needMigrateBlock(MachineBasicBlock *MBB) const;

  // Utility Functions
  void reversePredicateSetter(MachineBasicBlock::iterator I);
  /// Compute the reversed DFS post order of Blocks
  void orderBlocks(MachineFunction *MF);

  // Function originally from CFGStructTraits
  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
      DebugLoc DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
      DebugLoc DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
      DebugLoc DL);
  void insertCondBranchBefore(MachineBasicBlock *MBB,
      MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
      DebugLoc DL);
  void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
  static int getBranchNzeroOpcode(int OldOpcode);
  static int getBranchZeroOpcode(int OldOpcode);
  static int getContinueNzeroOpcode(int OldOpcode);
  static int getContinueZeroOpcode(int OldOpcode);
  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
      MachineInstr *MI);
  static bool isCondBranch(MachineInstr *MI);
  static bool isUncondBranch(MachineInstr *MI);
  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
  ///
  /// BB with backward-edge could have move instructions after the branch
  /// instruction.  Such move instruction "belong to" the loop backward-edge.
  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
  static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
  static bool isReturnBlock(MachineBasicBlock *MBB);
  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB) ;
  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
  /// because the AMDGPU instruction is not recognized as terminator fix this
  /// and retire this routine
  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
  static void wrapup(MachineBasicBlock *MBB);


  int patternMatch(MachineBasicBlock *MBB);
  int patternMatchGroup(MachineBasicBlock *MBB);
  int serialPatternMatch(MachineBasicBlock *MBB);
  int ifPatternMatch(MachineBasicBlock *MBB);
  int loopendPatternMatch();
  int mergeLoop(MachineLoop *LoopRep);
  int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);

  void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
      MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
      MachineLoop *ContLoop);
  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
  /// the same loop with LoopLandInfo without explicitly keeping track of
  /// loopContBlks and loopBreakBlks, this is a method to get the information.
  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
      MachineBasicBlock *Src2MBB);
  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock **LandMBBPtr);
  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock *LandMBB, bool Detail = false);
  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
  void mergeSerialBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB);

  void mergeIfthenelseBlock(MachineInstr *BranchMI,
      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *LandMBB);
  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
      MachineBasicBlock *LandMBB);
  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
      MachineBasicBlock *ContMBB);
  /// normalizeInfiniteLoopExit change
  /// B1:
  /// uncond_br LoopHeader
  ///
  /// to
  /// B1:
  /// cond_br 1 LoopHeader dummyExit
  /// and return the newly added dummy exit block
  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
  void removeUnconditionalBranch(MachineBasicBlock *MBB);
  /// Remove duplicate branches instructions in a block.
  /// For instance
  /// B0:
  ///    cond_br X B1 B2
  ///    cond_br X B1 B2
  /// is transformed to
  /// B0:
  ///    cond_br X B1 B2
  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
  void removeSuccessor(MachineBasicBlock *MBB);
  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
      MachineBasicBlock *PredMBB);
  void migrateInstruction(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
  void retireBlock(MachineBasicBlock *MBB);
  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);

  MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
  /// This is work around solution for findNearestCommonDominator not available
  /// to post dom a proper fix should go to Dominators.h.
  MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
      MachineBasicBlock *MBB2);

private:
  MBBInfoMap BlockInfoMap;      // per-block retired/SCC bookkeeping
  LoopLandInfoMap LLInfoMap;    // loop -> landing block
  std::map<MachineLoop *, bool> Visited;
  MachineFunction *FuncRep;     // the function being structurized
  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};

/// Return the SCC number recorded for \p MBB, or INVALIDSCCNUM if the block
/// was never visited by orderBlocks().
int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
  if (It == BlockInfoMap.end())
    return INVALIDSCCNUM;
  return (*It).second->SccNum;
}

/// Return the landing block recorded for \p LoopRep, or nullptr if none has
/// been set via setLoopLandBlock().
MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
    const {
  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
  if (It == LLInfoMap.end())
    return nullptr;
  return (*It).second;
}

/// True if \p MBB has a back edge, i.e. it is inside a loop and branches
/// directly to that loop's header.
bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
  if (!LoopRep)
    return false;
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  return MBB->isSuccessor(LoopHeader);
}

/// Loop depth of \p LoopRep; 0 for a null loop.
unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
  return LoopRep ? LoopRep->getLoopDepth() : 0;
}

/// True if \p MBB has been merged away by a previous pattern match.
bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
  if (It == BlockInfoMap.end())
    return false;
  return (*It).second->IsRetired;
}

/// True if \p MBB heads a loop (at any nesting level) that is still "active",
/// i.e. whose landing block is unknown or not yet retired.
bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
  while (LoopRep && LoopRep->getHeader() == MBB) {
    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
    if(!LoopLand)
      return true;
    if (!isRetiredBlock(LoopLand))
      return true;
    // MBB may simultaneously head several nested loops; check them all.
    LoopRep = LoopRep->getParentLoop();
  }
  return false;
}

/// Follow the unique-successor chain from \p SrcMBB and classify whether it
/// reaches \p DstMBB (see PathToKind).  If \p AllowSideEntry is false, any
/// block on the chain with multiple predecessors disqualifies the path.
AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
    bool AllowSideEntry) const {
  assert(DstMBB);
  if (SrcMBB == DstMBB)
    return SinglePath_InPath;
  while (SrcMBB && SrcMBB->succ_size() == 1) {
    SrcMBB = *SrcMBB->succ_begin();
    if (SrcMBB == DstMBB)
      return SinglePath_InPath;
    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
      return Not_SinglePath;
  }
  if (SrcMBB && SrcMBB->succ_size()==0)
    return SinglePath_NotInPath;
  return Not_SinglePath;
}

/// Count the blocks in [It, E) that have not been retired.
int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
    MBBVector::const_iterator E) const {
  int Count = 0;
  while (It != E) {
    if (!isRetiredBlock(*It))
      ++Count;
    ++It;
  }
  return Count;
}

/// Heuristic: returns true when \p MBB has multiple predecessors and is big
/// enough that duplicating it into each predecessor would be too costly
/// (thresholds: >30 instructions and >100 total cloned instructions).
bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
  unsigned BlockSizeThreshold = 30;
  unsigned CloneInstrThreshold = 100;
  bool MultiplePreds = MBB && (MBB->pred_size() > 1);

  if(!MultiplePreds)
    return false;
  unsigned BlkSize = MBB->size();
  return ((BlkSize > BlockSizeThreshold) &&
      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
}
BlockSizeThreshold) && 440 (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); 441 } 442 443 void AMDGPUCFGStructurizer::reversePredicateSetter( 444 MachineBasicBlock::iterator I) { 445 while (I--) { 446 if (I->getOpcode() == AMDGPU::PRED_X) { 447 switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) { 448 case OPCODE_IS_ZERO_INT: 449 static_cast<MachineInstr *>(I)->getOperand(2) 450 .setImm(OPCODE_IS_NOT_ZERO_INT); 451 return; 452 case OPCODE_IS_NOT_ZERO_INT: 453 static_cast<MachineInstr *>(I)->getOperand(2) 454 .setImm(OPCODE_IS_ZERO_INT); 455 return; 456 case OPCODE_IS_ZERO: 457 static_cast<MachineInstr *>(I)->getOperand(2) 458 .setImm(OPCODE_IS_NOT_ZERO); 459 return; 460 case OPCODE_IS_NOT_ZERO: 461 static_cast<MachineInstr *>(I)->getOperand(2) 462 .setImm(OPCODE_IS_ZERO); 463 return; 464 default: 465 llvm_unreachable("PRED_X Opcode invalid!"); 466 } 467 } 468 } 469 } 470 471 void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, 472 int NewOpcode, DebugLoc DL) { 473 MachineInstr *MI = MBB->getParent() 474 ->CreateMachineInstr(TII->get(NewOpcode), DL); 475 MBB->push_back(MI); 476 //assume the instruction doesn't take any reg operand ... 477 SHOWNEWINSTR(MI); 478 } 479 480 MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, 481 int NewOpcode, DebugLoc DL) { 482 MachineInstr *MI = 483 MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); 484 if (MBB->begin() != MBB->end()) 485 MBB->insert(MBB->begin(), MI); 486 else 487 MBB->push_back(MI); 488 SHOWNEWINSTR(MI); 489 return MI; 490 } 491 492 MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( 493 MachineBasicBlock::iterator I, int NewOpcode) { 494 MachineInstr *OldMI = &(*I); 495 MachineBasicBlock *MBB = OldMI->getParent(); 496 MachineInstr *NewMBB = 497 MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); 498 MBB->insert(I, NewMBB); 499 //assume the instruction doesn't take any reg operand ... 
500 SHOWNEWINSTR(NewMBB); 501 return NewMBB; 502 } 503 504 void AMDGPUCFGStructurizer::insertCondBranchBefore( 505 MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { 506 MachineInstr *OldMI = &(*I); 507 MachineBasicBlock *MBB = OldMI->getParent(); 508 MachineFunction *MF = MBB->getParent(); 509 MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); 510 MBB->insert(I, NewMI); 511 MachineInstrBuilder MIB(*MF, NewMI); 512 MIB.addReg(OldMI->getOperand(1).getReg(), false); 513 SHOWNEWINSTR(NewMI); 514 //erase later oldInstr->eraseFromParent(); 515 } 516 517 void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, 518 MachineBasicBlock::iterator I, int NewOpcode, int RegNum, 519 DebugLoc DL) { 520 MachineFunction *MF = blk->getParent(); 521 MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); 522 //insert before 523 blk->insert(I, NewInstr); 524 MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); 525 SHOWNEWINSTR(NewInstr); 526 } 527 528 void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, 529 int NewOpcode, int RegNum) { 530 MachineFunction *MF = MBB->getParent(); 531 MachineInstr *NewInstr = 532 MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); 533 MBB->push_back(NewInstr); 534 MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); 535 SHOWNEWINSTR(NewInstr); 536 } 537 538 int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { 539 switch(OldOpcode) { 540 case AMDGPU::JUMP_COND: 541 case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; 542 case AMDGPU::BRANCH_COND_i32: 543 case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; 544 default: llvm_unreachable("internal error"); 545 } 546 return -1; 547 } 548 549 int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { 550 switch(OldOpcode) { 551 case AMDGPU::JUMP_COND: 552 case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; 553 case AMDGPU::BRANCH_COND_i32: 554 case AMDGPU::BRANCH_COND_f32: return 
AMDGPU::IF_LOGICALZ_f32; 555 default: llvm_unreachable("internal error"); 556 } 557 return -1; 558 } 559 560 int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { 561 switch(OldOpcode) { 562 case AMDGPU::JUMP_COND: 563 case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; 564 default: llvm_unreachable("internal error"); 565 }; 566 return -1; 567 } 568 569 int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { 570 switch(OldOpcode) { 571 case AMDGPU::JUMP_COND: 572 case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; 573 default: llvm_unreachable("internal error"); 574 } 575 return -1; 576 } 577 578 MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { 579 return MI->getOperand(0).getMBB(); 580 } 581 582 void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, 583 MachineBasicBlock *MBB) { 584 MI->getOperand(0).setMBB(MBB); 585 } 586 587 MachineBasicBlock * 588 AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, 589 MachineInstr *MI) { 590 assert(MBB->succ_size() == 2); 591 MachineBasicBlock *TrueBranch = getTrueBranch(MI); 592 MachineBasicBlock::succ_iterator It = MBB->succ_begin(); 593 MachineBasicBlock::succ_iterator Next = It; 594 ++Next; 595 return (*It == TrueBranch) ? 
*Next : *It; 596 } 597 598 bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { 599 switch (MI->getOpcode()) { 600 case AMDGPU::JUMP_COND: 601 case AMDGPU::BRANCH_COND_i32: 602 case AMDGPU::BRANCH_COND_f32: return true; 603 default: 604 return false; 605 } 606 return false; 607 } 608 609 bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { 610 switch (MI->getOpcode()) { 611 case AMDGPU::JUMP: 612 case AMDGPU::BRANCH: 613 return true; 614 default: 615 return false; 616 } 617 return false; 618 } 619 620 DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { 621 //get DebugLoc from the first MachineBasicBlock instruction with debug info 622 DebugLoc DL; 623 for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); 624 ++It) { 625 MachineInstr *instr = &(*It); 626 if (instr->getDebugLoc()) 627 DL = instr->getDebugLoc(); 628 } 629 return DL; 630 } 631 632 MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( 633 MachineBasicBlock *MBB) { 634 MachineBasicBlock::reverse_iterator It = MBB->rbegin(); 635 MachineInstr *MI = &*It; 636 if (MI && (isCondBranch(MI) || isUncondBranch(MI))) 637 return MI; 638 return nullptr; 639 } 640 641 MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( 642 MachineBasicBlock *MBB) { 643 for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); 644 It != E; ++It) { 645 // FIXME: Simplify 646 MachineInstr *MI = &*It; 647 if (MI) { 648 if (isCondBranch(MI) || isUncondBranch(MI)) 649 return MI; 650 else if (!TII->isMov(MI->getOpcode())) 651 break; 652 } 653 } 654 return nullptr; 655 } 656 657 MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { 658 MachineBasicBlock::reverse_iterator It = MBB->rbegin(); 659 if (It != MBB->rend()) { 660 MachineInstr *instr = &(*It); 661 if (instr->getOpcode() == AMDGPU::RETURN) 662 return instr; 663 } 664 return nullptr; 665 } 666 667 MachineInstr 
*AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { 668 MachineBasicBlock::reverse_iterator It = MBB->rbegin(); 669 if (It != MBB->rend()) { 670 MachineInstr *MI = &(*It); 671 if (MI->getOpcode() == AMDGPU::CONTINUE) 672 return MI; 673 } 674 return nullptr; 675 } 676 677 bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { 678 MachineInstr *MI = getReturnInstr(MBB); 679 bool IsReturn = (MBB->succ_size() == 0); 680 if (MI) 681 assert(IsReturn); 682 else if (IsReturn) 683 DEBUG( 684 dbgs() << "BB" << MBB->getNumber() 685 <<" is return block without RETURN instr\n";); 686 return IsReturn; 687 } 688 689 void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, 690 MachineBasicBlock *SrcMBB) { 691 for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), 692 iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) 693 DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of 694 } 695 696 MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { 697 MachineFunction *Func = MBB->getParent(); 698 MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); 699 Func->push_back(NewMBB); //insert to function 700 for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); 701 It != E; ++It) { 702 MachineInstr *MI = Func->CloneMachineInstr(It); 703 NewMBB->push_back(MI); 704 } 705 return NewMBB; 706 } 707 708 void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( 709 MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, 710 MachineBasicBlock *NewBlk) { 711 MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); 712 if (BranchMI && isCondBranch(BranchMI) && 713 getTrueBranch(BranchMI) == OldMBB) 714 setTrueBranch(BranchMI, NewBlk); 715 } 716 717 void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { 718 assert((!MBB->getParent()->getJumpTableInfo() 719 || MBB->getParent()->getJumpTableInfo()->isEmpty()) 720 && "found a jump table"); 721 722 //collect continue right before endloop 
723 SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr; 724 MachineBasicBlock::iterator Pre = MBB->begin(); 725 MachineBasicBlock::iterator E = MBB->end(); 726 MachineBasicBlock::iterator It = Pre; 727 while (It != E) { 728 if (Pre->getOpcode() == AMDGPU::CONTINUE 729 && It->getOpcode() == AMDGPU::ENDLOOP) 730 ContInstr.push_back(Pre); 731 Pre = It; 732 ++It; 733 } 734 735 //delete continue right before endloop 736 for (unsigned i = 0; i < ContInstr.size(); ++i) 737 ContInstr[i]->eraseFromParent(); 738 739 // TODO to fix up jump table so later phase won't be confused. if 740 // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but 741 // there isn't such an interface yet. alternatively, replace all the other 742 // blocks in the jump table with the entryBlk //} 743 744 } 745 746 747 bool AMDGPUCFGStructurizer::prepare() { 748 bool Changed = false; 749 750 //FIXME: if not reducible flow graph, make it so ??? 751 752 DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); 753 754 orderBlocks(FuncRep); 755 756 SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks; 757 758 // Add an ExitBlk to loop that don't have one 759 for (MachineLoopInfo::iterator It = MLI->begin(), 760 E = MLI->end(); It != E; ++It) { 761 MachineLoop *LoopRep = (*It); 762 MBBVector ExitingMBBs; 763 LoopRep->getExitingBlocks(ExitingMBBs); 764 765 if (ExitingMBBs.size() == 0) { 766 MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); 767 if (DummyExitBlk) 768 RetBlks.push_back(DummyExitBlk); 769 } 770 } 771 772 // Remove unconditional branch instr. 773 // Add dummy exit block iff there are multiple returns. 
/// Main structurization driver: repeatedly run pattern matching over the
/// ordered blocks, one SCC at a time, until the function collapses to a
/// single block (success) or no progress is made (irreducible CFG -> fatal).
bool AMDGPUCFGStructurizer::run() {

  //Assume reducible CFG...
  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");

#ifdef STRESSTEST
  //Use the worse block ordering to test the algorithm.
  ReverseVector(orderedBlks);
#endif

  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
  int NumIter = 0;
  bool Finish = false;
  MachineBasicBlock *MBB;
  bool MakeProgress = false;
  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
                                        OrderedBlks.end());

  do {
    ++NumIter;
    DEBUG(
      dbgs() << "numIter = " << NumIter
             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
    );

    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
        OrderedBlks.begin();
    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
        OrderedBlks.end();

    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
        It;
    MachineBasicBlock *SccBeginMBB = nullptr;
    int SccNumBlk = 0;  // The number of active blocks, init to a
                        // maximum possible number.
    // SccNumIter is set when a new SCC starts (SccBeginMBB == nullptr on the
    // first loop iteration), before any read.
    int SccNumIter;     // Number of iteration in this SCC.

    while (It != E) {
      MBB = *It;

      if (!SccBeginMBB) {
        // Entering a new SCC: remember where it starts so we can re-scan it.
        SccBeginIter = It;
        SccBeginMBB = MBB;
        SccNumIter = 0;
        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
        DEBUG(
          dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
          dbgs() << "\n";
        );
      }

      if (!isRetiredBlock(MBB))
        patternMatch(MBB);

      ++It;

      bool ContNextScc = true;
      if (It == E
          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
        // Just finish one scc.
        ++SccNumIter;
        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
          // This SCC did not shrink; give up on it for this outer iteration.
          DEBUG(
            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
                   << ", sccNumIter = " << SccNumIter;
            dbgs() << "doesn't make any progress\n";
          );
          ContNextScc = true;
        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
          // The SCC shrank but is not yet a single block: rewind and re-scan.
          SccNumBlk = sccRemainedNumBlk;
          It = SccBeginIter;
          ContNextScc = false;
          DEBUG(
            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
                   << "sccNumIter = " << SccNumIter << '\n';
          );
        } else {
          // Finish the current scc.
          ContNextScc = true;
        }
      } else {
        // Continue on next component in the current scc.
        ContNextScc = false;
      }

      if (ContNextScc)
        SccBeginMBB = nullptr;
    } //while, "one iteration" over the function.

    MachineBasicBlock *EntryMBB =
        GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
    // Done when the entry block has no successors: the whole CFG has been
    // folded into it.
    if (EntryMBB->succ_size() == 0) {
      Finish = true;
      DEBUG(
        dbgs() << "Reduce to one block\n";
      );
    } else {
      int NewnumRemainedBlk
        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
      // consider cloned blocks ??
      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
        MakeProgress = true;
        NumRemainedBlk = NewnumRemainedBlk;
      } else {
        MakeProgress = false;
        DEBUG(
          dbgs() << "No progress\n";
        );
      }
    }
  } while (!Finish && MakeProgress);

  // Misc wrap up to maintain the consistency of the Function representation.
  wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));

  // Detach retired Block, release memory.
  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
      It != E; ++It) {
    if ((*It).second && (*It).second->IsRetired) {
      assert(((*It).first)->getNumber() != -1);
      DEBUG(
        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
      );
      (*It).first->eraseFromParent();  //Remove from the parent Function.
    }
    delete (*It).second;
  }
  BlockInfoMap.clear();
  LLInfoMap.clear();

  if (!Finish) {
    DEBUG(FuncRep->viewCFG());
    llvm_unreachable("IRREDUCIBLE_CFG");
  }

  return true;
}



/// Number every block with the SCC it belongs to (in scc_iterator order) and
/// record the visit order in OrderedBlks; report unreachable blocks.
void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
  int SccNum = 0;
  MachineBasicBlock *MBB;
  for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
       ++It, ++SccNum) {
    const std::vector<MachineBasicBlock *> &SccNext = *It;
    for (std::vector<MachineBasicBlock *>::const_iterator
         blockIter = SccNext.begin(), blockEnd = SccNext.end();
         blockIter != blockEnd; ++blockIter) {
      MBB = *blockIter;
      OrderedBlks.push_back(MBB);
      recordSccnum(MBB, SccNum);
    }
  }

  //walk through all the block in func to check for unreachable
  typedef GraphTraits<MachineFunction *> GTM;
  MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
  for (; It != E; ++It) {
    MachineBasicBlock *MBB = &(*It);
    SccNum = getSCCNum(MBB);
    if (SccNum == INVALIDSCCNUM)
      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
  }
}

/// Run pattern-match groups on \p MBB until no more patterns apply; return
/// the total number of matches performed.
int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
  int NumMatch = 0;
  int CurMatch;

  DEBUG(
        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
  );

  while ((CurMatch = patternMatchGroup(MBB)) > 0)
    NumMatch += CurMatch;

  DEBUG(
        dbgs() << "End patternMatch BB" << MBB->getNumber()
      << ", numMatch = " << NumMatch << "\n";
  );

  return NumMatch;
}
= " << NumMatch << "\n"; 974 ); 975 976 return NumMatch; 977 } 978 979 int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { 980 int NumMatch = 0; 981 NumMatch += loopendPatternMatch(); 982 NumMatch += serialPatternMatch(MBB); 983 NumMatch += ifPatternMatch(MBB); 984 return NumMatch; 985 } 986 987 988 int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { 989 if (MBB->succ_size() != 1) 990 return 0; 991 992 MachineBasicBlock *childBlk = *MBB->succ_begin(); 993 if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) 994 return 0; 995 996 mergeSerialBlock(MBB, childBlk); 997 ++numSerialPatternMatch; 998 return 1; 999 } 1000 1001 int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { 1002 //two edges 1003 if (MBB->succ_size() != 2) 1004 return 0; 1005 if (hasBackEdge(MBB)) 1006 return 0; 1007 MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); 1008 if (!BranchMI) 1009 return 0; 1010 1011 assert(isCondBranch(BranchMI)); 1012 int NumMatch = 0; 1013 1014 MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); 1015 NumMatch += serialPatternMatch(TrueMBB); 1016 NumMatch += ifPatternMatch(TrueMBB); 1017 MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); 1018 NumMatch += serialPatternMatch(FalseMBB); 1019 NumMatch += ifPatternMatch(FalseMBB); 1020 MachineBasicBlock *LandBlk; 1021 int Cloned = 0; 1022 1023 assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); 1024 // TODO: Simplify 1025 if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 1026 && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { 1027 // Diamond pattern 1028 LandBlk = *TrueMBB->succ_begin(); 1029 } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { 1030 // Triangle pattern, false is empty 1031 LandBlk = FalseMBB; 1032 FalseMBB = nullptr; 1033 } else if (FalseMBB->succ_size() == 1 1034 && *FalseMBB->succ_begin() == TrueMBB) { 1035 // Triangle pattern, true is empty 1036 // We reverse the predicate to make 
a triangle, empty false pattern; 1037 std::swap(TrueMBB, FalseMBB); 1038 reversePredicateSetter(MBB->end()); 1039 LandBlk = FalseMBB; 1040 FalseMBB = nullptr; 1041 } else if (FalseMBB->succ_size() == 1 1042 && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { 1043 LandBlk = *FalseMBB->succ_begin(); 1044 } else if (TrueMBB->succ_size() == 1 1045 && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { 1046 LandBlk = *TrueMBB->succ_begin(); 1047 } else { 1048 return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); 1049 } 1050 1051 // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the 1052 // new BB created for landBlk==NULL may introduce new challenge to the 1053 // reduction process. 1054 if (LandBlk && 1055 ((TrueMBB && TrueMBB->pred_size() > 1) 1056 || (FalseMBB && FalseMBB->pred_size() > 1))) { 1057 Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); 1058 } 1059 1060 if (TrueMBB && TrueMBB->pred_size() > 1) { 1061 TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); 1062 ++Cloned; 1063 } 1064 1065 if (FalseMBB && FalseMBB->pred_size() > 1) { 1066 FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); 1067 ++Cloned; 1068 } 1069 1070 mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); 1071 1072 ++numIfPatternMatch; 1073 1074 numClonedBlock += Cloned; 1075 1076 return 1 + Cloned + NumMatch; 1077 } 1078 1079 int AMDGPUCFGStructurizer::loopendPatternMatch() { 1080 std::deque<MachineLoop *> NestedLoops; 1081 for (auto &It: *MLI) 1082 for (MachineLoop *ML : depth_first(It)) 1083 NestedLoops.push_front(ML); 1084 1085 if (NestedLoops.size() == 0) 1086 return 0; 1087 1088 // Process nested loop outside->inside (we did push_front), 1089 // so "continue" to a outside loop won't be mistaken as "break" 1090 // of the current loop. 
1091 int Num = 0; 1092 for (MachineLoop *ExaminedLoop : NestedLoops) { 1093 if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) 1094 continue; 1095 DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); 1096 int NumBreak = mergeLoop(ExaminedLoop); 1097 if (NumBreak == -1) 1098 break; 1099 Num += NumBreak; 1100 } 1101 return Num; 1102 } 1103 1104 int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { 1105 MachineBasicBlock *LoopHeader = LoopRep->getHeader(); 1106 MBBVector ExitingMBBs; 1107 LoopRep->getExitingBlocks(ExitingMBBs); 1108 assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); 1109 DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); 1110 // We assume a single ExitBlk 1111 MBBVector ExitBlks; 1112 LoopRep->getExitBlocks(ExitBlks); 1113 SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet; 1114 for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) 1115 ExitBlkSet.insert(ExitBlks[i]); 1116 assert(ExitBlkSet.size() == 1); 1117 MachineBasicBlock *ExitBlk = *ExitBlks.begin(); 1118 assert(ExitBlk && "Loop has several exit block"); 1119 MBBVector LatchBlks; 1120 typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits; 1121 InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), 1122 PE = InvMBBTraits::child_end(LoopHeader); 1123 for (; PI != PE; PI++) { 1124 if (LoopRep->contains(*PI)) 1125 LatchBlks.push_back(*PI); 1126 } 1127 1128 for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) 1129 mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); 1130 for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) 1131 settleLoopcontBlock(LatchBlks[i], LoopHeader); 1132 int Match = 0; 1133 do { 1134 Match = 0; 1135 Match += serialPatternMatch(LoopHeader); 1136 Match += ifPatternMatch(LoopHeader); 1137 } while (Match > 0); 1138 mergeLooplandBlock(LoopHeader, ExitBlk); 1139 MachineLoop *ParentLoop = LoopRep->getParentLoop(); 1140 if (ParentLoop) 1141 MLI->changeLoopFor(LoopHeader, ParentLoop); 1142 else 1143 
MLI->removeBlock(LoopHeader); 1144 Visited[LoopRep] = true; 1145 return 1; 1146 } 1147 1148 int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, 1149 MachineBasicBlock *LoopHeader) { 1150 int NumCont = 0; 1151 SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB; 1152 typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM; 1153 GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), 1154 E = GTIM::child_end(LoopHeader); 1155 for (; It != E; ++It) { 1156 MachineBasicBlock *MBB = *It; 1157 if (LoopRep->contains(MBB)) { 1158 handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), 1159 LoopHeader, LoopRep); 1160 ContMBB.push_back(MBB); 1161 ++NumCont; 1162 } 1163 } 1164 1165 for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), 1166 E = ContMBB.end(); It != E; ++It) { 1167 (*It)->removeSuccessor(LoopHeader); 1168 } 1169 1170 numLoopcontPatternMatch += NumCont; 1171 1172 return NumCont; 1173 } 1174 1175 1176 bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( 1177 MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { 1178 if (Src1MBB->succ_size() == 0) { 1179 MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); 1180 if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { 1181 MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; 1182 if (TheEntry) { 1183 DEBUG( 1184 dbgs() << "isLoopContBreakBlock yes src1 = BB" 1185 << Src1MBB->getNumber() 1186 << " src2 = BB" << Src2MBB->getNumber() << "\n"; 1187 ); 1188 return true; 1189 } 1190 } 1191 } 1192 return false; 1193 } 1194 1195 int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, 1196 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { 1197 int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); 1198 if (Num == 0) { 1199 DEBUG( 1200 dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; 1201 ); 1202 Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); 1203 } 1204 return Num; 1205 } 1206 1207 int 
AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, 1208 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { 1209 int Num = 0; 1210 MachineBasicBlock *DownBlk; 1211 1212 //trueBlk could be the common post dominator 1213 DownBlk = TrueMBB; 1214 1215 DEBUG( 1216 dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() 1217 << " true = BB" << TrueMBB->getNumber() 1218 << ", numSucc=" << TrueMBB->succ_size() 1219 << " false = BB" << FalseMBB->getNumber() << "\n"; 1220 ); 1221 1222 while (DownBlk) { 1223 DEBUG( 1224 dbgs() << "check down = BB" << DownBlk->getNumber(); 1225 ); 1226 1227 if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { 1228 DEBUG( 1229 dbgs() << " working\n"; 1230 ); 1231 1232 Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); 1233 Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); 1234 1235 numClonedBlock += Num; 1236 Num += serialPatternMatch(*HeadMBB->succ_begin()); 1237 Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); 1238 Num += ifPatternMatch(HeadMBB); 1239 assert(Num > 0); 1240 1241 break; 1242 } 1243 DEBUG( 1244 dbgs() << " not working\n"; 1245 ); 1246 DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; 1247 } // walk down the postDomTree 1248 1249 return Num; 1250 } 1251 1252 void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( 1253 MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, 1254 MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { 1255 dbgs() << "head = BB" << HeadMBB->getNumber() 1256 << " size = " << HeadMBB->size(); 1257 if (Detail) { 1258 dbgs() << "\n"; 1259 HeadMBB->print(dbgs()); 1260 dbgs() << "\n"; 1261 } 1262 1263 if (TrueMBB) { 1264 dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " 1265 << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); 1266 if (Detail) { 1267 dbgs() << "\n"; 1268 TrueMBB->print(dbgs()); 1269 dbgs() << "\n"; 1270 } 1271 } 1272 if (FalseMBB) { 1273 dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " 1274 << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); 1275 if (Detail) { 1276 dbgs() << "\n"; 1277 FalseMBB->print(dbgs()); 1278 dbgs() << "\n"; 1279 } 1280 } 1281 if (LandMBB) { 1282 dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " 1283 << LandMBB->size() << " numPred = " << LandMBB->pred_size(); 1284 if (Detail) { 1285 dbgs() << "\n"; 1286 LandMBB->print(dbgs()); 1287 dbgs() << "\n"; 1288 } 1289 } 1290 1291 dbgs() << "\n"; 1292 } 1293 1294 int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, 1295 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, 1296 MachineBasicBlock **LandMBBPtr) { 1297 bool MigrateTrue = false; 1298 bool MigrateFalse = false; 1299 1300 MachineBasicBlock *LandBlk = *LandMBBPtr; 1301 1302 assert((!TrueMBB || TrueMBB->succ_size() <= 1) 1303 && (!FalseMBB || FalseMBB->succ_size() <= 1)); 1304 1305 if (TrueMBB == FalseMBB) 1306 return 0; 1307 1308 MigrateTrue = needMigrateBlock(TrueMBB); 1309 MigrateFalse = needMigrateBlock(FalseMBB); 1310 1311 if (!MigrateTrue && !MigrateFalse) 1312 return 0; 1313 1314 // If we need to migrate either trueBlk and falseBlk, 
migrate the rest that 1315 // have more than one predecessors. without doing this, its predecessor 1316 // rather than headBlk will have undefined value in initReg. 1317 if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) 1318 MigrateTrue = true; 1319 if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) 1320 MigrateFalse = true; 1321 1322 DEBUG( 1323 dbgs() << "before improveSimpleJumpintoIf: "; 1324 showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); 1325 ); 1326 1327 // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk 1328 // 1329 // new: headBlk => if () {initReg = 1; org trueBlk branch} else 1330 // {initReg = 0; org falseBlk branch } 1331 // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} 1332 // => org landBlk 1333 // if landBlk->pred_size() > 2, put the about if-else inside 1334 // if (initReg !=2) {...} 1335 // 1336 // add initReg = initVal to headBlk 1337 1338 const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); 1339 if (!MigrateTrue || !MigrateFalse) { 1340 // XXX: We have an opportunity here to optimize the "branch into if" case 1341 // here. Branch into if looks like this: 1342 // entry 1343 // / | 1344 // diamond_head branch_from 1345 // / \ | 1346 // diamond_false diamond_true 1347 // \ / 1348 // done 1349 // 1350 // The diamond_head block begins the "if" and the diamond_true block 1351 // is the block being "branched into". 1352 // 1353 // If MigrateTrue is true, then TrueBB is the block being "branched into" 1354 // and if MigrateFalse is true, then FalseBB is the block being 1355 // "branched into" 1356 // 1357 // Here is the pseudo code for how I think the optimization should work: 1358 // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. 1359 // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. 1360 // 3. Move the branch instruction from diamond_head into its own basic 1361 // block (new_block). 1362 // 4. 
Add an unconditional branch from diamond_head to new_block 1363 // 5. Replace the branch instruction in branch_from with an unconditional 1364 // branch to new_block. If branch_from has multiple predecessors, then 1365 // we need to replace the True/False block in the branch 1366 // instruction instead of replacing it. 1367 // 6. Change the condition of the branch instruction in new_block from 1368 // COND to (COND || GPR0) 1369 // 1370 // In order insert these MOV instruction, we will need to use the 1371 // RegisterScavenger. Usually liveness stops being tracked during 1372 // the late machine optimization passes, however if we implement 1373 // bool TargetRegisterInfo::requiresRegisterScavenging( 1374 // const MachineFunction &MF) 1375 // and have it return true, liveness will be tracked correctly 1376 // by generic optimization passes. We will also need to make sure that 1377 // all of our target-specific passes that run after regalloc and before 1378 // the CFGStructurizer track liveness and we will need to modify this pass 1379 // to correctly track liveness. 1380 // 1381 // After the above changes, the new CFG should look like this: 1382 // entry 1383 // / | 1384 // diamond_head branch_from 1385 // \ / 1386 // new_block 1387 // / | 1388 // diamond_false diamond_true 1389 // \ / 1390 // done 1391 // 1392 // Without this optimization, we are forced to duplicate the diamond_true 1393 // block and we will end up with a CFG like this: 1394 // 1395 // entry 1396 // / | 1397 // diamond_head branch_from 1398 // / \ | 1399 // diamond_false diamond_true diamond_true (duplicate) 1400 // \ / | 1401 // done --------------------| 1402 // 1403 // Duplicating diamond_true can be very costly especially if it has a 1404 // lot of instructions. 
1405 return 0; 1406 } 1407 1408 int NumNewBlk = 0; 1409 1410 bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); 1411 1412 //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" 1413 MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); 1414 1415 if (LandBlkHasOtherPred) { 1416 llvm_unreachable("Extra register needed to handle CFG"); 1417 unsigned CmpResReg = 1418 HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); 1419 llvm_unreachable("Extra compare instruction needed to handle CFG"); 1420 insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, 1421 CmpResReg, DebugLoc()); 1422 } 1423 1424 // XXX: We are running this after RA, so creating virtual registers will 1425 // cause an assertion failure in the PostRA scheduling pass. 1426 unsigned InitReg = 1427 HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); 1428 insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, 1429 DebugLoc()); 1430 1431 if (MigrateTrue) { 1432 migrateInstruction(TrueMBB, LandBlk, I); 1433 // need to uncondionally insert the assignment to ensure a path from its 1434 // predecessor rather than headBlk has valid value in initReg if 1435 // (initVal != 1). 
1436 llvm_unreachable("Extra register needed to handle CFG"); 1437 } 1438 insertInstrBefore(I, AMDGPU::ELSE); 1439 1440 if (MigrateFalse) { 1441 migrateInstruction(FalseMBB, LandBlk, I); 1442 // need to uncondionally insert the assignment to ensure a path from its 1443 // predecessor rather than headBlk has valid value in initReg if 1444 // (initVal != 0) 1445 llvm_unreachable("Extra register needed to handle CFG"); 1446 } 1447 1448 if (LandBlkHasOtherPred) { 1449 // add endif 1450 insertInstrBefore(I, AMDGPU::ENDIF); 1451 1452 // put initReg = 2 to other predecessors of landBlk 1453 for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), 1454 PE = LandBlk->pred_end(); PI != PE; ++PI) { 1455 MachineBasicBlock *MBB = *PI; 1456 if (MBB != TrueMBB && MBB != FalseMBB) 1457 llvm_unreachable("Extra register needed to handle CFG"); 1458 } 1459 } 1460 DEBUG( 1461 dbgs() << "result from improveSimpleJumpintoIf: "; 1462 showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); 1463 ); 1464 1465 // update landBlk 1466 *LandMBBPtr = LandBlk; 1467 1468 return NumNewBlk; 1469 } 1470 1471 void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, 1472 MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, 1473 MachineLoop *ContLoop) { 1474 DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() 1475 << " header = BB" << ContMBB->getNumber() << "\n"; 1476 dbgs() << "Trying to continue loop-depth = " 1477 << getLoopDepth(ContLoop) 1478 << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); 1479 settleLoopcontBlock(ContingMBB, ContMBB); 1480 } 1481 1482 void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, 1483 MachineBasicBlock *SrcMBB) { 1484 DEBUG( 1485 dbgs() << "serialPattern BB" << DstMBB->getNumber() 1486 << " <= BB" << SrcMBB->getNumber() << "\n"; 1487 ); 1488 DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); 1489 1490 DstMBB->removeSuccessor(SrcMBB); 1491 
cloneSuccessorList(DstMBB, SrcMBB); 1492 1493 removeSuccessor(SrcMBB); 1494 MLI->removeBlock(SrcMBB); 1495 retireBlock(SrcMBB); 1496 } 1497 1498 void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, 1499 MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, 1500 MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { 1501 assert (TrueMBB); 1502 DEBUG( 1503 dbgs() << "ifPattern BB" << MBB->getNumber(); 1504 dbgs() << "{ "; 1505 if (TrueMBB) { 1506 dbgs() << "BB" << TrueMBB->getNumber(); 1507 } 1508 dbgs() << " } else "; 1509 dbgs() << "{ "; 1510 if (FalseMBB) { 1511 dbgs() << "BB" << FalseMBB->getNumber(); 1512 } 1513 dbgs() << " }\n "; 1514 dbgs() << "landBlock: "; 1515 if (!LandMBB) { 1516 dbgs() << "NULL"; 1517 } else { 1518 dbgs() << "BB" << LandMBB->getNumber(); 1519 } 1520 dbgs() << "\n"; 1521 ); 1522 1523 int OldOpcode = BranchMI->getOpcode(); 1524 DebugLoc BranchDL = BranchMI->getDebugLoc(); 1525 1526 // transform to 1527 // if cond 1528 // trueBlk 1529 // else 1530 // falseBlk 1531 // endif 1532 // landBlk 1533 1534 MachineBasicBlock::iterator I = BranchMI; 1535 insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), 1536 BranchDL); 1537 1538 if (TrueMBB) { 1539 MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); 1540 MBB->removeSuccessor(TrueMBB); 1541 if (LandMBB && TrueMBB->succ_size()!=0) 1542 TrueMBB->removeSuccessor(LandMBB); 1543 retireBlock(TrueMBB); 1544 MLI->removeBlock(TrueMBB); 1545 } 1546 1547 if (FalseMBB) { 1548 insertInstrBefore(I, AMDGPU::ELSE); 1549 MBB->splice(I, FalseMBB, FalseMBB->begin(), 1550 FalseMBB->end()); 1551 MBB->removeSuccessor(FalseMBB); 1552 if (LandMBB && FalseMBB->succ_size() != 0) 1553 FalseMBB->removeSuccessor(LandMBB); 1554 retireBlock(FalseMBB); 1555 MLI->removeBlock(FalseMBB); 1556 } 1557 insertInstrBefore(I, AMDGPU::ENDIF); 1558 1559 BranchMI->eraseFromParent(); 1560 1561 if (LandMBB && TrueMBB && FalseMBB) 1562 MBB->addSuccessor(LandMBB); 1563 1564 } 1565 1566 void 
AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, 1567 MachineBasicBlock *LandMBB) { 1568 DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() 1569 << " land = BB" << LandMBB->getNumber() << "\n";); 1570 1571 insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); 1572 insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); 1573 DstBlk->addSuccessor(LandMBB); 1574 DstBlk->removeSuccessor(DstBlk); 1575 } 1576 1577 1578 void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, 1579 MachineBasicBlock *LandMBB) { 1580 DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() 1581 << " land = BB" << LandMBB->getNumber() << "\n";); 1582 MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); 1583 assert(BranchMI && isCondBranch(BranchMI)); 1584 DebugLoc DL = BranchMI->getDebugLoc(); 1585 MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); 1586 MachineBasicBlock::iterator I = BranchMI; 1587 if (TrueBranch != LandMBB) 1588 reversePredicateSetter(I); 1589 insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); 1590 insertInstrBefore(I, AMDGPU::BREAK); 1591 insertInstrBefore(I, AMDGPU::ENDIF); 1592 //now branchInst can be erase safely 1593 BranchMI->eraseFromParent(); 1594 //now take care of successors, retire blocks 1595 ExitingMBB->removeSuccessor(LandMBB); 1596 } 1597 1598 void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, 1599 MachineBasicBlock *ContMBB) { 1600 DEBUG(dbgs() << "settleLoopcontBlock conting = BB" 1601 << ContingMBB->getNumber() 1602 << ", cont = BB" << ContMBB->getNumber() << "\n";); 1603 1604 MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); 1605 if (MI) { 1606 assert(isCondBranch(MI)); 1607 MachineBasicBlock::iterator I = MI; 1608 MachineBasicBlock *TrueBranch = getTrueBranch(MI); 1609 int OldOpcode = MI->getOpcode(); 1610 DebugLoc DL = MI->getDebugLoc(); 1611 1612 bool UseContinueLogical = 
((&*ContingMBB->rbegin()) == MI); 1613 1614 if (!UseContinueLogical) { 1615 int BranchOpcode = 1616 TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : 1617 getBranchZeroOpcode(OldOpcode); 1618 insertCondBranchBefore(I, BranchOpcode, DL); 1619 // insertEnd to ensure phi-moves, if exist, go before the continue-instr. 1620 insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); 1621 insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); 1622 } else { 1623 int BranchOpcode = 1624 TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : 1625 getContinueZeroOpcode(OldOpcode); 1626 insertCondBranchBefore(I, BranchOpcode, DL); 1627 } 1628 1629 MI->eraseFromParent(); 1630 } else { 1631 // if we've arrived here then we've already erased the branch instruction 1632 // travel back up the basic block to see the last reference of our debug 1633 // location we've just inserted that reference here so it should be 1634 // representative insertEnd to ensure phi-moves, if exist, go before the 1635 // continue-instr. 
1636 insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, 1637 getLastDebugLocInBB(ContingMBB)); 1638 } 1639 } 1640 1641 int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, 1642 MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { 1643 int Cloned = 0; 1644 assert(PreMBB->isSuccessor(SrcMBB)); 1645 while (SrcMBB && SrcMBB != DstMBB) { 1646 assert(SrcMBB->succ_size() == 1); 1647 if (SrcMBB->pred_size() > 1) { 1648 SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); 1649 ++Cloned; 1650 } 1651 1652 PreMBB = SrcMBB; 1653 SrcMBB = *SrcMBB->succ_begin(); 1654 } 1655 1656 return Cloned; 1657 } 1658 1659 MachineBasicBlock * 1660 AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, 1661 MachineBasicBlock *PredMBB) { 1662 assert(PredMBB->isSuccessor(MBB) && 1663 "succBlk is not a prececessor of curBlk"); 1664 1665 MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions 1666 replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); 1667 //srcBlk, oldBlk, newBlk 1668 1669 PredMBB->removeSuccessor(MBB); 1670 PredMBB->addSuccessor(CloneMBB); 1671 1672 // add all successor to cloneBlk 1673 cloneSuccessorList(CloneMBB, MBB); 1674 1675 numClonedInstr += MBB->size(); 1676 1677 DEBUG( 1678 dbgs() << "Cloned block: " << "BB" 1679 << MBB->getNumber() << "size " << MBB->size() << "\n"; 1680 ); 1681 1682 SHOWNEWBLK(CloneMBB, "result of Cloned block: "); 1683 1684 return CloneMBB; 1685 } 1686 1687 void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, 1688 MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { 1689 MachineBasicBlock::iterator SpliceEnd; 1690 //look for the input branchinstr, not the AMDGPU branchinstr 1691 MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); 1692 if (!BranchMI) { 1693 DEBUG( 1694 dbgs() << "migrateInstruction don't see branch instr\n" ; 1695 ); 1696 SpliceEnd = SrcMBB->end(); 1697 } else { 1698 DEBUG( 1699 dbgs() << "migrateInstruction see branch instr\n" ; 1700 BranchMI->dump(); 1701 ); 
1702 SpliceEnd = BranchMI; 1703 } 1704 DEBUG( 1705 dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() 1706 << "srcSize = " << SrcMBB->size() << "\n"; 1707 ); 1708 1709 //splice insert before insertPos 1710 DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); 1711 1712 DEBUG( 1713 dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() 1714 << "srcSize = " << SrcMBB->size() << "\n"; 1715 ); 1716 } 1717 1718 MachineBasicBlock * 1719 AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { 1720 MachineBasicBlock *LoopHeader = LoopRep->getHeader(); 1721 MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); 1722 const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); 1723 1724 if (!LoopHeader || !LoopLatch) 1725 return nullptr; 1726 MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); 1727 // Is LoopRep an infinite loop ? 1728 if (!BranchMI || !isUncondBranch(BranchMI)) 1729 return nullptr; 1730 1731 MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); 1732 FuncRep->push_back(DummyExitBlk); //insert to function 1733 SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); 1734 DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); 1735 MachineBasicBlock::iterator I = BranchMI; 1736 unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); 1737 llvm_unreachable("Extra register needed to handle CFG"); 1738 MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); 1739 MachineInstrBuilder MIB(*FuncRep, NewMI); 1740 MIB.addMBB(LoopHeader); 1741 MIB.addReg(ImmReg, false); 1742 SHOWNEWINSTR(NewMI); 1743 BranchMI->eraseFromParent(); 1744 LoopLatch->addSuccessor(DummyExitBlk); 1745 1746 return DummyExitBlk; 1747 } 1748 1749 void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { 1750 MachineInstr *BranchMI; 1751 1752 // I saw two unconditional branch in one basic block in example 1753 // 
test_fc_do_while_or.c need to fix the upstream on this to remove the loop. 1754 while ((BranchMI = getLoopendBlockBranchInstr(MBB)) 1755 && isUncondBranch(BranchMI)) { 1756 DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); 1757 BranchMI->eraseFromParent(); 1758 } 1759 } 1760 1761 void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( 1762 MachineBasicBlock *MBB) { 1763 if (MBB->succ_size() != 2) 1764 return; 1765 MachineBasicBlock *MBB1 = *MBB->succ_begin(); 1766 MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); 1767 if (MBB1 != MBB2) 1768 return; 1769 1770 MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); 1771 assert(BranchMI && isCondBranch(BranchMI)); 1772 DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); 1773 BranchMI->eraseFromParent(); 1774 SHOWNEWBLK(MBB1, "Removing redundant successor"); 1775 MBB->removeSuccessor(MBB1); 1776 } 1777 1778 void AMDGPUCFGStructurizer::addDummyExitBlock( 1779 SmallVectorImpl<MachineBasicBlock*> &RetMBB) { 1780 MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); 1781 FuncRep->push_back(DummyExitBlk); //insert to function 1782 insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); 1783 1784 for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(), 1785 E = RetMBB.end(); It != E; ++It) { 1786 MachineBasicBlock *MBB = *It; 1787 MachineInstr *MI = getReturnInstr(MBB); 1788 if (MI) 1789 MI->eraseFromParent(); 1790 MBB->addSuccessor(DummyExitBlk); 1791 DEBUG( 1792 dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() 1793 << " successors\n"; 1794 ); 1795 } 1796 SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); 1797 } 1798 1799 void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { 1800 while (MBB->succ_size()) 1801 MBB->removeSuccessor(*MBB->succ_begin()); 1802 } 1803 1804 void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, 1805 int SccNum) { 1806 BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; 1807 if (!srcBlkInfo) 
1808 srcBlkInfo = new BlockInformation(); 1809 srcBlkInfo->SccNum = SccNum; 1810 } 1811 1812 void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { 1813 DEBUG( 1814 dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; 1815 ); 1816 1817 BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; 1818 1819 if (!SrcBlkInfo) 1820 SrcBlkInfo = new BlockInformation(); 1821 1822 SrcBlkInfo->IsRetired = true; 1823 assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 1824 && "can't retire block yet"); 1825 } 1826 1827 void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, 1828 MachineBasicBlock *MBB) { 1829 MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; 1830 if (!MBB) { 1831 MBB = FuncRep->CreateMachineBasicBlock(); 1832 FuncRep->push_back(MBB); //insert to function 1833 SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); 1834 } 1835 TheEntry = MBB; 1836 DEBUG( 1837 dbgs() << "setLoopLandBlock loop-header = BB" 1838 << loopRep->getHeader()->getNumber() 1839 << " landing-block = BB" << MBB->getNumber() << "\n"; 1840 ); 1841 } 1842 1843 MachineBasicBlock * 1844 AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, 1845 MachineBasicBlock *MBB2) { 1846 1847 if (PDT->dominates(MBB1, MBB2)) 1848 return MBB1; 1849 if (PDT->dominates(MBB2, MBB1)) 1850 return MBB2; 1851 1852 MachineDomTreeNode *Node1 = PDT->getNode(MBB1); 1853 MachineDomTreeNode *Node2 = PDT->getNode(MBB2); 1854 1855 // Handle newly cloned node. 
1856 if (!Node1 && MBB1->succ_size() == 1) 1857 return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); 1858 if (!Node2 && MBB2->succ_size() == 1) 1859 return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); 1860 1861 if (!Node1 || !Node2) 1862 return nullptr; 1863 1864 Node1 = Node1->getIDom(); 1865 while (Node1) { 1866 if (PDT->dominates(Node1, Node2)) 1867 return Node1->getBlock(); 1868 Node1 = Node1->getIDom(); 1869 } 1870 1871 return nullptr; 1872 } 1873 1874 MachineBasicBlock * 1875 AMDGPUCFGStructurizer::findNearestCommonPostDom( 1876 std::set<MachineBasicBlock *> &MBBs) { 1877 MachineBasicBlock *CommonDom; 1878 std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin(); 1879 std::set<MachineBasicBlock *>::const_iterator E = MBBs.end(); 1880 for (CommonDom = *It; It != E && CommonDom; ++It) { 1881 MachineBasicBlock *MBB = *It; 1882 if (MBB != CommonDom) 1883 CommonDom = findNearestCommonPostDom(MBB, CommonDom); 1884 } 1885 1886 DEBUG( 1887 dbgs() << "Common post dominator for exit blocks is "; 1888 if (CommonDom) 1889 dbgs() << "BB" << CommonDom->getNumber() << "\n"; 1890 else 1891 dbgs() << "NULL\n"; 1892 ); 1893 1894 return CommonDom; 1895 } 1896 1897 char AMDGPUCFGStructurizer::ID = 0; 1898 1899 } // end anonymous namespace 1900 1901 1902 INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", 1903 "AMDGPU CFG Structurizer", false, false) 1904 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 1905 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) 1906 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 1907 INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", 1908 "AMDGPU CFG Structurizer", false, false) 1909 1910 FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { 1911 return new AMDGPUCFGStructurizer(); 1912 } 1913