// Home | History | Annotate | Download | only in AMDGPU
      1 //===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 /// \file
      9 //==-----------------------------------------------------------------------===//
     10 
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <deque>
     34 
     35 using namespace llvm;
     36 
     37 #define DEBUG_TYPE "structcfg"
     38 
     39 #define DEFAULT_VEC_SLOTS 8
     40 
     41 // TODO: move-begin.
     42 
     43 //===----------------------------------------------------------------------===//
     44 //
     45 // Statistics for CFGStructurizer.
     46 //
     47 //===----------------------------------------------------------------------===//
     48 
// Pass-wide statistics (printed with -stats); bumped as the structurizer
// matches patterns and clones blocks/instructions.
STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
    "matched");
STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
    "matched");
STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
     55 
namespace llvm {
  // Forward declaration of the pass initializer; referenced by the
  // constructor below.
  void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
}
     59 
     60 //===----------------------------------------------------------------------===//
     61 //
     62 // Miscellaneous utility for CFGStructurizer.
     63 //
     64 //===----------------------------------------------------------------------===//
     65 namespace {
     66 #define SHOWNEWINSTR(i) \
     67   DEBUG(dbgs() << "New instr: " << *i << "\n");
     68 
     69 #define SHOWNEWBLK(b, msg) \
     70 DEBUG( \
     71   dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
     72   dbgs() << "\n"; \
     73 );
     74 
     75 #define SHOWBLK_DETAIL(b, msg) \
     76 DEBUG( \
     77   if (b) { \
     78   dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
     79   b->print(dbgs()); \
     80   dbgs() << "\n"; \
     81   } \
     82 );
     83 
     84 #define INVALIDSCCNUM -1
     85 
     86 template<class NodeT>
     87 void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
     88   size_t sz = Src.size();
     89   for (size_t i = 0; i < sz/2; ++i) {
     90     NodeT *t = Src[i];
     91     Src[i] = Src[sz - i - 1];
     92     Src[sz - i - 1] = t;
     93   }
     94 }
     95 
     96 } // end anonymous namespace
     97 
     98 //===----------------------------------------------------------------------===//
     99 //
    100 // supporting data structure for CFGStructurizer
    101 //
    102 //===----------------------------------------------------------------------===//
    103 
    104 
    105 namespace {
    106 
    107 class BlockInformation {
    108 public:
    109   bool IsRetired;
    110   int  SccNum;
    111   BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
    112 };
    113 
    114 } // end anonymous namespace
    115 
    116 //===----------------------------------------------------------------------===//
    117 //
    118 // CFGStructurizer
    119 //
    120 //===----------------------------------------------------------------------===//
    121 
    122 namespace {
/// CFG structurizer pass for the R600 backend.
///
/// For each machine function, runOnMachineFunction() runs prepare()
/// (branch cleanup and loop-exit normalization) followed by run()
/// (iterative pattern matching over the CFG).
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;

  // Result of singlePathTo(): whether Dst is reachable from Src along a
  // chain of single-successor blocks.
  enum PathToKind {
    Not_SinglePath = 0,
    SinglePath_InPath = 1,
    SinglePath_NotInPath = 2
  };

  static char ID;

  AMDGPUCFGStructurizer() :
      MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
  }

   const char *getPassName() const override {
    return "AMDGPU Control Flow Graph structurizer Pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<MachineFunctionAnalysis>();
    AU.addRequired<MachineFunctionAnalysis>();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addRequired<MachineLoopInfo>();
  }

  /// Perform the CFG structurization
  bool run();

  /// Perform the CFG preparation
  /// This step will remove every unconditional/dead jump instructions and make
  /// sure all loops have an exit block
  bool prepare();

  bool runOnMachineFunction(MachineFunction &MF) override {
    TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
    TRI = &TII->getRegisterInfo();
    DEBUG(MF.dump(););
    OrderedBlks.clear();
    Visited.clear();
    FuncRep = &MF;
    MLI = &getAnalysis<MachineLoopInfo>();
    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
    MDT = &getAnalysis<MachineDominatorTree>();
    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
    PDT = &getAnalysis<MachinePostDominatorTree>();
    DEBUG(PDT->print(dbgs()););
    prepare();
    run();
    DEBUG(MF.dump(););
    return true;
  }

protected:
  // Analyses and target hooks, cached per function by runOnMachineFunction.
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;
  MachineLoopInfo *MLI;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;

  // PRINT FUNCTIONS
  /// Print the ordered Blocks.
  void printOrderedBlocks() const {
    size_t i = 0;
    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
      dbgs() << "BB" << (*iterBlk)->getNumber();
      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
      if (i != 0 && i % 10 == 0) {
        dbgs() << "\n";
      } else {
        dbgs() << " ";
      }
    }
  }
  /// Dump every loop known to \p LoopInfo to the debug stream.
  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
    for (MachineLoop::iterator iter = LoopInfo.begin(),
         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
      (*iter)->print(dbgs(), 0);
    }
  }

  // UTILITY FUNCTIONS
  int getSCCNum(MachineBasicBlock *MBB) const;
  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
  bool hasBackEdge(MachineBasicBlock *MBB) const;
  bool isRetiredBlock(MachineBasicBlock *MBB) const;
  bool isActiveLoophead(MachineBasicBlock *MBB) const;
  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
      bool AllowSideEntry = true) const;
  int countActiveBlock(MBBVector::const_iterator It,
      MBBVector::const_iterator E) const;
  bool needMigrateBlock(MachineBasicBlock *MBB) const;

  // Utility Functions
  void reversePredicateSetter(MachineBasicBlock::iterator I);
  /// Compute the reversed DFS post order of Blocks
  void orderBlocks(MachineFunction *MF);

  // Function originally from CFGStructTraits
  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
                      const DebugLoc &DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
                                  const DebugLoc &DL = DebugLoc());
  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
                              const DebugLoc &DL);
  void insertCondBranchBefore(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator I, int NewOpcode,
                              int RegNum, const DebugLoc &DL);
  // Opcode translation helpers: map unstructured branch opcodes to their
  // structured equivalents (see definitions below the class).
  static int getBranchNzeroOpcode(int OldOpcode);
  static int getBranchZeroOpcode(int OldOpcode);
  static int getContinueNzeroOpcode(int OldOpcode);
  static int getContinueZeroOpcode(int OldOpcode);
  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
      MachineInstr *MI);
  static bool isCondBranch(MachineInstr *MI);
  static bool isUncondBranch(MachineInstr *MI);
  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
  ///
  /// BB with backward-edge could have move instructions after the branch
  /// instruction.  Such move instruction "belong to" the loop backward-edge.
  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
  static bool isReturnBlock(MachineBasicBlock *MBB);
  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB) ;
  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
  /// because the AMDGPU instruction is not recognized as terminator fix this
  /// and retire this routine
  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
  static void wrapup(MachineBasicBlock *MBB);


  // Pattern matchers; each returns the number of patterns rewritten.
  int patternMatch(MachineBasicBlock *MBB);
  int patternMatchGroup(MachineBasicBlock *MBB);
  int serialPatternMatch(MachineBasicBlock *MBB);
  int ifPatternMatch(MachineBasicBlock *MBB);
  int loopendPatternMatch();
  int mergeLoop(MachineLoop *LoopRep);

  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
  /// the same loop with LoopLandInfo without explicitly keeping track of
  /// loopContBlks and loopBreakBlks, this is a method to get the information.
  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
      MachineBasicBlock *Src2MBB);
  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock **LandMBBPtr);
  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
      MachineBasicBlock *LandMBB, bool Detail = false);
  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
  void mergeSerialBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *SrcMBB);

  void mergeIfthenelseBlock(MachineInstr *BranchMI,
      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
      MachineBasicBlock *LandMBB);
  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
      MachineBasicBlock *LandMBB);
  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
      MachineBasicBlock *ContMBB);
  /// normalizeInfiniteLoopExit change
  ///   B1:
  ///        uncond_br LoopHeader
  ///
  /// to
  ///   B1:
  ///        cond_br 1 LoopHeader dummyExit
  /// and return the newly added dummy exit block
  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
  void removeUnconditionalBranch(MachineBasicBlock *MBB);
  /// Remove duplicate branches instructions in a block.
  /// For instance
  /// B0:
  ///    cond_br X B1 B2
  ///    cond_br X B1 B2
  /// is transformed to
  /// B0:
  ///    cond_br X B1 B2
  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
  void removeSuccessor(MachineBasicBlock *MBB);
  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
      MachineBasicBlock *PredMBB);
  void migrateInstruction(MachineBasicBlock *SrcMBB,
      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
  void retireBlock(MachineBasicBlock *MBB);


private:
  MBBInfoMap BlockInfoMap;    // per-block retired flag and SCC number
  LoopLandInfoMap LLInfoMap;  // loop -> recorded landing block (getLoopLandInfo)
  std::map<MachineLoop *, bool> Visited;
  MachineFunction *FuncRep;   // function currently being structurized
  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};
    340 
    341 int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
    342   MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
    343   if (It == BlockInfoMap.end())
    344     return INVALIDSCCNUM;
    345   return (*It).second->SccNum;
    346 }
    347 
    348 MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
    349     const {
    350   LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
    351   if (It == LLInfoMap.end())
    352     return nullptr;
    353   return (*It).second;
    354 }
    355 
    356 bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
    357   MachineLoop *LoopRep = MLI->getLoopFor(MBB);
    358   if (!LoopRep)
    359     return false;
    360   MachineBasicBlock *LoopHeader = LoopRep->getHeader();
    361   return MBB->isSuccessor(LoopHeader);
    362 }
    363 
    364 bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
    365   MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
    366   if (It == BlockInfoMap.end())
    367     return false;
    368   return (*It).second->IsRetired;
    369 }
    370 
    371 bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
    372   MachineLoop *LoopRep = MLI->getLoopFor(MBB);
    373   while (LoopRep && LoopRep->getHeader() == MBB) {
    374     MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
    375     if(!LoopLand)
    376       return true;
    377     if (!isRetiredBlock(LoopLand))
    378       return true;
    379     LoopRep = LoopRep->getParentLoop();
    380   }
    381   return false;
    382 }
    383 AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
    384     MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
    385     bool AllowSideEntry) const {
    386   assert(DstMBB);
    387   if (SrcMBB == DstMBB)
    388     return SinglePath_InPath;
    389   while (SrcMBB && SrcMBB->succ_size() == 1) {
    390     SrcMBB = *SrcMBB->succ_begin();
    391     if (SrcMBB == DstMBB)
    392       return SinglePath_InPath;
    393     if (!AllowSideEntry && SrcMBB->pred_size() > 1)
    394       return Not_SinglePath;
    395   }
    396   if (SrcMBB && SrcMBB->succ_size()==0)
    397     return SinglePath_NotInPath;
    398   return Not_SinglePath;
    399 }
    400 
    401 int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
    402     MBBVector::const_iterator E) const {
    403   int Count = 0;
    404   while (It != E) {
    405     if (!isRetiredBlock(*It))
    406       ++Count;
    407     ++It;
    408   }
    409   return Count;
    410 }
    411 
    412 bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
    413   unsigned BlockSizeThreshold = 30;
    414   unsigned CloneInstrThreshold = 100;
    415   bool MultiplePreds = MBB && (MBB->pred_size() > 1);
    416 
    417   if(!MultiplePreds)
    418     return false;
    419   unsigned BlkSize = MBB->size();
    420   return ((BlkSize > BlockSizeThreshold) &&
    421       (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
    422 }
    423 
    424 void AMDGPUCFGStructurizer::reversePredicateSetter(
    425     MachineBasicBlock::iterator I) {
    426   assert(static_cast<MachineInstr *>(I) && "Expected valid iterator");
    427   for (;; --I) {
    428     if (I->getOpcode() == AMDGPU::PRED_X) {
    429       switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
    430       case OPCODE_IS_ZERO_INT:
    431         static_cast<MachineInstr *>(I)->getOperand(2)
    432             .setImm(OPCODE_IS_NOT_ZERO_INT);
    433         return;
    434       case OPCODE_IS_NOT_ZERO_INT:
    435         static_cast<MachineInstr *>(I)->getOperand(2)
    436             .setImm(OPCODE_IS_ZERO_INT);
    437         return;
    438       case OPCODE_IS_ZERO:
    439         static_cast<MachineInstr *>(I)->getOperand(2)
    440             .setImm(OPCODE_IS_NOT_ZERO);
    441         return;
    442       case OPCODE_IS_NOT_ZERO:
    443         static_cast<MachineInstr *>(I)->getOperand(2)
    444             .setImm(OPCODE_IS_ZERO);
    445         return;
    446       default:
    447         llvm_unreachable("PRED_X Opcode invalid!");
    448       }
    449     }
    450   }
    451 }
    452 
    453 void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
    454                                            int NewOpcode, const DebugLoc &DL) {
    455   MachineInstr *MI =
    456       MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
    457   MBB->push_back(MI);
    458   //assume the instruction doesn't take any reg operand ...
    459   SHOWNEWINSTR(MI);
    460 }
    461 
    462 MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
    463                                                        int NewOpcode,
    464                                                        const DebugLoc &DL) {
    465   MachineInstr *MI =
    466       MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
    467   if (MBB->begin() != MBB->end())
    468     MBB->insert(MBB->begin(), MI);
    469   else
    470     MBB->push_back(MI);
    471   SHOWNEWINSTR(MI);
    472   return MI;
    473 }
    474 
    475 MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
    476     MachineBasicBlock::iterator I, int NewOpcode) {
    477   MachineInstr *OldMI = &(*I);
    478   MachineBasicBlock *MBB = OldMI->getParent();
    479   MachineInstr *NewMBB =
    480       MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
    481   MBB->insert(I, NewMBB);
    482   //assume the instruction doesn't take any reg operand ...
    483   SHOWNEWINSTR(NewMBB);
    484   return NewMBB;
    485 }
    486 
    487 void AMDGPUCFGStructurizer::insertCondBranchBefore(
    488     MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
    489   MachineInstr *OldMI = &(*I);
    490   MachineBasicBlock *MBB = OldMI->getParent();
    491   MachineFunction *MF = MBB->getParent();
    492   MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
    493   MBB->insert(I, NewMI);
    494   MachineInstrBuilder MIB(*MF, NewMI);
    495   MIB.addReg(OldMI->getOperand(1).getReg(), false);
    496   SHOWNEWINSTR(NewMI);
    497   //erase later oldInstr->eraseFromParent();
    498 }
    499 
    500 void AMDGPUCFGStructurizer::insertCondBranchBefore(
    501     MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
    502     int RegNum, const DebugLoc &DL) {
    503   MachineFunction *MF = blk->getParent();
    504   MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
    505   //insert before
    506   blk->insert(I, NewInstr);
    507   MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
    508   SHOWNEWINSTR(NewInstr);
    509 }
    510 
    511 int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
    512   switch(OldOpcode) {
    513   case AMDGPU::JUMP_COND:
    514   case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
    515   case AMDGPU::BRANCH_COND_i32:
    516   case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
    517   default: llvm_unreachable("internal error");
    518   }
    519   return -1;
    520 }
    521 
    522 int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
    523   switch(OldOpcode) {
    524   case AMDGPU::JUMP_COND:
    525   case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
    526   case AMDGPU::BRANCH_COND_i32:
    527   case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
    528   default: llvm_unreachable("internal error");
    529   }
    530   return -1;
    531 }
    532 
    533 int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
    534   switch(OldOpcode) {
    535   case AMDGPU::JUMP_COND:
    536   case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
    537   default: llvm_unreachable("internal error");
    538   };
    539   return -1;
    540 }
    541 
    542 int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
    543   switch(OldOpcode) {
    544   case AMDGPU::JUMP_COND:
    545   case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
    546   default: llvm_unreachable("internal error");
    547   }
    548   return -1;
    549 }
    550 
    551 MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
    552   return MI->getOperand(0).getMBB();
    553 }
    554 
    555 void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
    556     MachineBasicBlock *MBB) {
    557   MI->getOperand(0).setMBB(MBB);
    558 }
    559 
    560 MachineBasicBlock *
    561 AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
    562     MachineInstr *MI) {
    563   assert(MBB->succ_size() == 2);
    564   MachineBasicBlock *TrueBranch = getTrueBranch(MI);
    565   MachineBasicBlock::succ_iterator It = MBB->succ_begin();
    566   MachineBasicBlock::succ_iterator Next = It;
    567   ++Next;
    568   return (*It == TrueBranch) ? *Next : *It;
    569 }
    570 
    571 bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
    572   switch (MI->getOpcode()) {
    573     case AMDGPU::JUMP_COND:
    574     case AMDGPU::BRANCH_COND_i32:
    575     case AMDGPU::BRANCH_COND_f32: return true;
    576   default:
    577     return false;
    578   }
    579   return false;
    580 }
    581 
    582 bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
    583   switch (MI->getOpcode()) {
    584   case AMDGPU::JUMP:
    585   case AMDGPU::BRANCH:
    586     return true;
    587   default:
    588     return false;
    589   }
    590   return false;
    591 }
    592 
    593 DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
    594   //get DebugLoc from the first MachineBasicBlock instruction with debug info
    595   DebugLoc DL;
    596   for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
    597       ++It) {
    598     MachineInstr *instr = &(*It);
    599     if (instr->getDebugLoc())
    600       DL = instr->getDebugLoc();
    601   }
    602   return DL;
    603 }
    604 
    605 MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
    606     MachineBasicBlock *MBB) {
    607   MachineBasicBlock::reverse_iterator It = MBB->rbegin();
    608   MachineInstr *MI = &*It;
    609   if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
    610     return MI;
    611   return nullptr;
    612 }
    613 
    614 MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
    615     MachineBasicBlock *MBB) {
    616   for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
    617       It != E; ++It) {
    618     // FIXME: Simplify
    619     MachineInstr *MI = &*It;
    620     if (MI) {
    621       if (isCondBranch(MI) || isUncondBranch(MI))
    622         return MI;
    623       else if (!TII->isMov(MI->getOpcode()))
    624         break;
    625     }
    626   }
    627   return nullptr;
    628 }
    629 
    630 MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
    631   MachineBasicBlock::reverse_iterator It = MBB->rbegin();
    632   if (It != MBB->rend()) {
    633     MachineInstr *instr = &(*It);
    634     if (instr->getOpcode() == AMDGPU::RETURN)
    635       return instr;
    636   }
    637   return nullptr;
    638 }
    639 
    640 bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
    641   MachineInstr *MI = getReturnInstr(MBB);
    642   bool IsReturn = (MBB->succ_size() == 0);
    643   if (MI)
    644     assert(IsReturn);
    645   else if (IsReturn)
    646     DEBUG(
    647       dbgs() << "BB" << MBB->getNumber()
    648              <<" is return block without RETURN instr\n";);
    649   return  IsReturn;
    650 }
    651 
    652 void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
    653     MachineBasicBlock *SrcMBB) {
    654   for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
    655        iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
    656     DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
    657 }
    658 
    659 MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
    660   MachineFunction *Func = MBB->getParent();
    661   MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
    662   Func->push_back(NewMBB);  //insert to function
    663   for (const MachineInstr &It : *MBB)
    664     NewMBB->push_back(Func->CloneMachineInstr(&It));
    665   return NewMBB;
    666 }
    667 
    668 void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
    669     MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
    670     MachineBasicBlock *NewBlk) {
    671   MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
    672   if (BranchMI && isCondBranch(BranchMI) &&
    673       getTrueBranch(BranchMI) == OldMBB)
    674     setTrueBranch(BranchMI, NewBlk);
    675 }
    676 
    677 void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
    678   assert((!MBB->getParent()->getJumpTableInfo()
    679           || MBB->getParent()->getJumpTableInfo()->isEmpty())
    680          && "found a jump table");
    681 
    682    //collect continue right before endloop
    683    SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
    684    MachineBasicBlock::iterator Pre = MBB->begin();
    685    MachineBasicBlock::iterator E = MBB->end();
    686    MachineBasicBlock::iterator It = Pre;
    687    while (It != E) {
    688      if (Pre->getOpcode() == AMDGPU::CONTINUE
    689          && It->getOpcode() == AMDGPU::ENDLOOP)
    690        ContInstr.push_back(&*Pre);
    691      Pre = It;
    692      ++It;
    693    }
    694 
    695    //delete continue right before endloop
    696    for (unsigned i = 0; i < ContInstr.size(); ++i)
    697       ContInstr[i]->eraseFromParent();
    698 
    699    // TODO to fix up jump table so later phase won't be confused.  if
    700    // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
    701    // there isn't such an interface yet.  alternatively, replace all the other
    702    // blocks in the jump table with the entryBlk //}
    703 
    704 }
    705 
    706 
    707 bool AMDGPUCFGStructurizer::prepare() {
    708   bool Changed = false;
    709 
    710   //FIXME: if not reducible flow graph, make it so ???
    711 
    712   DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
    713 
    714   orderBlocks(FuncRep);
    715 
    716   SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
    717 
    718   // Add an ExitBlk to loop that don't have one
    719   for (MachineLoopInfo::iterator It = MLI->begin(),
    720        E = MLI->end(); It != E; ++It) {
    721     MachineLoop *LoopRep = (*It);
    722     MBBVector ExitingMBBs;
    723     LoopRep->getExitingBlocks(ExitingMBBs);
    724 
    725     if (ExitingMBBs.size() == 0) {
    726       MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
    727       if (DummyExitBlk)
    728         RetBlks.push_back(DummyExitBlk);
    729     }
    730   }
    731 
    732   // Remove unconditional branch instr.
    733   // Add dummy exit block iff there are multiple returns.
    734   for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
    735        It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
    736     MachineBasicBlock *MBB = *It;
    737     removeUnconditionalBranch(MBB);
    738     removeRedundantConditionalBranch(MBB);
    739     if (isReturnBlock(MBB)) {
    740       RetBlks.push_back(MBB);
    741     }
    742     assert(MBB->succ_size() <= 2);
    743   }
    744 
    745   if (RetBlks.size() >= 2) {
    746     addDummyExitBlock(RetBlks);
    747     Changed = true;
    748   }
    749 
    750   return Changed;
    751 }
    752 
/// Main driver of the structurizer.  Repeatedly sweeps the pre-computed
/// block order, applying the pattern matchers to each SCC until the CFG is
/// reduced to a single block; reports a fatal error if no progress can be
/// made (irreducible CFG).  Returns true on success.
bool AMDGPUCFGStructurizer::run() {

  //Assume reducible CFG...
  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");

#ifdef STRESSTEST
  //Use the worse block ordering to test the algorithm.
  ReverseVector(orderedBlks);
#endif

  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
  int NumIter = 0;
  bool Finish = false;
  MachineBasicBlock *MBB;
  bool MakeProgress = false;
  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
                                        OrderedBlks.end());

  do {
    ++NumIter;
    DEBUG(
      dbgs() << "numIter = " << NumIter
             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
    );

    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
        OrderedBlks.begin();
    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
        OrderedBlks.end();

    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
        It;
    MachineBasicBlock *SccBeginMBB = nullptr;
    int SccNumBlk = 0;  // The number of active blocks, init to a
                        // maximum possible number.
    int SccNumIter;     // Number of iteration in this SCC.

    while (It != E) {
      MBB = *It;

      // Entering a new SCC: remember its start so we can rewind and
      // re-process the SCC if a pass over it shrinks it.
      if (!SccBeginMBB) {
        SccBeginIter = It;
        SccBeginMBB = MBB;
        SccNumIter = 0;
        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
        DEBUG(
              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
              dbgs() << "\n";
        );
      }

      if (!isRetiredBlock(MBB))
        patternMatch(MBB);

      ++It;

      bool ContNextScc = true;
      // SCC boundary (or end of the block list) reached: decide whether
      // to repeat the SCC just processed or to move on to the next one.
      if (It == E
          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
        // Just finish one scc.
        ++SccNumIter;
        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
          // No block was retired in this SCC pass; give up on it for now.
          DEBUG(
            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
                   << ", sccNumIter = " << SccNumIter;
            dbgs() << "doesn't make any progress\n";
          );
          ContNextScc = true;
        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
          // The SCC shrank: rewind the iterator and process it again.
          SccNumBlk = sccRemainedNumBlk;
          It = SccBeginIter;
          ContNextScc = false;
          DEBUG(
            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
                   << "sccNumIter = " << SccNumIter << '\n';
          );
        } else {
          // Finish the current scc.
          ContNextScc = true;
        }
      } else {
        // Continue on next component in the current scc.
        ContNextScc = false;
      }

      if (ContNextScc)
        SccBeginMBB = nullptr;
    } //while, "one iteration" over the function.

    // The function is fully structurized once the entry block has no
    // remaining successors.
    MachineBasicBlock *EntryMBB =
        &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
    if (EntryMBB->succ_size() == 0) {
      Finish = true;
      DEBUG(
        dbgs() << "Reduce to one block\n";
      );
    } else {
      int NewnumRemainedBlk
        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
      // consider cloned blocks ??
      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
        MakeProgress = true;
        NumRemainedBlk = NewnumRemainedBlk;
      } else {
        MakeProgress = false;
        DEBUG(
          dbgs() << "No progress\n";
        );
      }
    }
  } while (!Finish && MakeProgress);

  // Misc wrap up to maintain the consistency of the Function representation.
  wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));

  // Detach retired Block, release memory.
  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
      It != E; ++It) {
    if ((*It).second && (*It).second->IsRetired) {
      assert(((*It).first)->getNumber() != -1);
      DEBUG(
        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
      );
      (*It).first->eraseFromParent();  //Remove from the parent Function.
    }
    delete (*It).second;
  }
  BlockInfoMap.clear();
  LLInfoMap.clear();

  // Could not reduce to a single block: the CFG is irreducible and this
  // pass cannot handle it.
  if (!Finish) {
    DEBUG(FuncRep->viewCFG());
    report_fatal_error("IRREDUCIBLE_CFG");
  }

  return true;
}
    891 
    892 
    893 
    894 void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
    895   int SccNum = 0;
    896   MachineBasicBlock *MBB;
    897   for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
    898        ++It, ++SccNum) {
    899     const std::vector<MachineBasicBlock *> &SccNext = *It;
    900     for (std::vector<MachineBasicBlock *>::const_iterator
    901          blockIter = SccNext.begin(), blockEnd = SccNext.end();
    902          blockIter != blockEnd; ++blockIter) {
    903       MBB = *blockIter;
    904       OrderedBlks.push_back(MBB);
    905       recordSccnum(MBB, SccNum);
    906     }
    907   }
    908 
    909   //walk through all the block in func to check for unreachable
    910   typedef GraphTraits<MachineFunction *> GTM;
    911   MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
    912   for (; It != E; ++It) {
    913     MachineBasicBlock *MBB = &(*It);
    914     SccNum = getSCCNum(MBB);
    915     if (SccNum == INVALIDSCCNUM)
    916       dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
    917   }
    918 }
    919 
    920 int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
    921   int NumMatch = 0;
    922   int CurMatch;
    923 
    924   DEBUG(
    925         dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
    926   );
    927 
    928   while ((CurMatch = patternMatchGroup(MBB)) > 0)
    929     NumMatch += CurMatch;
    930 
    931   DEBUG(
    932         dbgs() << "End patternMatch BB" << MBB->getNumber()
    933       << ", numMatch = " << NumMatch << "\n";
    934   );
    935 
    936   return NumMatch;
    937 }
    938 
    939 int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
    940   int NumMatch = 0;
    941   NumMatch += loopendPatternMatch();
    942   NumMatch += serialPatternMatch(MBB);
    943   NumMatch += ifPatternMatch(MBB);
    944   return NumMatch;
    945 }
    946 
    947 
    948 int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
    949   if (MBB->succ_size() != 1)
    950     return 0;
    951 
    952   MachineBasicBlock *childBlk = *MBB->succ_begin();
    953   if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
    954     return 0;
    955 
    956   mergeSerialBlock(MBB, childBlk);
    957   ++numSerialPatternMatch;
    958   return 1;
    959 }
    960 
/// If pattern: \p MBB ends in a conditional branch with two successors.
/// First reduce both arms recursively, then classify the shape (diamond,
/// triangle with an empty arm, or loop break/continue variant), clone any
/// arm with side entries, and merge the region into a single block.
/// Returns the total number of patterns matched (including recursion and
/// cloning); 0 if \p MBB does not fit the pattern.
int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
  //two edges
  if (MBB->succ_size() != 2)
    return 0;
  if (hasBackEdge(MBB))
    return 0;
  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
  if (!BranchMI)
    return 0;

  assert(isCondBranch(BranchMI));
  int NumMatch = 0;

  // Reduce both arms as far as possible before classifying the shape.
  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
  NumMatch += serialPatternMatch(TrueMBB);
  NumMatch += ifPatternMatch(TrueMBB);
  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
  NumMatch += serialPatternMatch(FalseMBB);
  NumMatch += ifPatternMatch(FalseMBB);
  MachineBasicBlock *LandBlk;
  int Cloned = 0;

  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
  // TODO: Simplify
  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
    // Diamond pattern
    LandBlk = *TrueMBB->succ_begin();
  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
    // Triangle pattern, false is empty
    LandBlk = FalseMBB;
    FalseMBB = nullptr;
  } else if (FalseMBB->succ_size() == 1
             && *FalseMBB->succ_begin() == TrueMBB) {
    // Triangle pattern, true is empty
    // We reverse the predicate to make a triangle, empty false pattern;
    std::swap(TrueMBB, FalseMBB);
    reversePredicateSetter(MBB->end());
    LandBlk = FalseMBB;
    FalseMBB = nullptr;
  } else if (FalseMBB->succ_size() == 1
             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
    // One arm is a detached loop break/continue block in the same loop.
    LandBlk = *FalseMBB->succ_begin();
  } else if (TrueMBB->succ_size() == 1
    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
    LandBlk = *TrueMBB->succ_begin();
  } else {
    // No recognizable shape: try the jump-into-if transformation instead.
    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
  }

  // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
  // new BB created for landBlk==NULL may introduce new challenge to the
  // reduction process.
  if (LandBlk &&
      ((TrueMBB && TrueMBB->pred_size() > 1)
      || (FalseMBB && FalseMBB->pred_size() > 1))) {
     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
  }

  // Arms with side entries must be cloned so this region owns its copy.
  if (TrueMBB && TrueMBB->pred_size() > 1) {
    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
    ++Cloned;
  }

  if (FalseMBB && FalseMBB->pred_size() > 1) {
    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
    ++Cloned;
  }

  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);

  ++numIfPatternMatch;

  numClonedBlock += Cloned;

  return 1 + Cloned + NumMatch;
}
   1038 
   1039 int AMDGPUCFGStructurizer::loopendPatternMatch() {
   1040   std::deque<MachineLoop *> NestedLoops;
   1041   for (auto &It: *MLI)
   1042     for (MachineLoop *ML : depth_first(It))
   1043       NestedLoops.push_front(ML);
   1044 
   1045   if (NestedLoops.size() == 0)
   1046     return 0;
   1047 
   1048   // Process nested loop outside->inside (we did push_front),
   1049   // so "continue" to a outside loop won't be mistaken as "break"
   1050   // of the current loop.
   1051   int Num = 0;
   1052   for (MachineLoop *ExaminedLoop : NestedLoops) {
   1053     if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
   1054       continue;
   1055     DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
   1056     int NumBreak = mergeLoop(ExaminedLoop);
   1057     if (NumBreak == -1)
   1058       break;
   1059     Num += NumBreak;
   1060   }
   1061   return Num;
   1062 }
   1063 
/// Structurize one loop: rewrite every exiting block as a BREAK, every
/// latch as a CONTINUE, reduce the header's body, then wrap the header in
/// WHILELOOP/ENDLOOP and mark the loop visited.  Assumes the loop has
/// exactly one exit block.  Returns 1 (one pattern matched).
int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  MBBVector ExitingMBBs;
  LoopRep->getExitingBlocks(ExitingMBBs);
  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
  // We assume a single ExitBlk
  MBBVector ExitBlks;
  LoopRep->getExitBlocks(ExitBlks);
  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
    ExitBlkSet.insert(ExitBlks[i]);
  assert(ExitBlkSet.size() == 1);
  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
  assert(ExitBlk && "Loop has several exit block");
  // Latch blocks are the header's predecessors that are inside the loop.
  MBBVector LatchBlks;
  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
      PE = InvMBBTraits::child_end(LoopHeader);
  for (; PI != PE; PI++) {
    if (LoopRep->contains(*PI))
      LatchBlks.push_back(*PI);
  }

  // Turn loop exits into BREAKs and back-edges into CONTINUEs.
  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
    settleLoopcontBlock(LatchBlks[i], LoopHeader);
  // Collapse the loop body into the header before wrapping it.
  int Match = 0;
  do {
    Match = 0;
    Match += serialPatternMatch(LoopHeader);
    Match += ifPatternMatch(LoopHeader);
  } while (Match > 0);
  mergeLooplandBlock(LoopHeader, ExitBlk);
  // The header now belongs to the parent loop (if any) in loop info.
  MachineLoop *ParentLoop = LoopRep->getParentLoop();
  if (ParentLoop)
    MLI->changeLoopFor(LoopHeader, ParentLoop);
  else
    MLI->removeBlock(LoopHeader);
  Visited[LoopRep] = true;
  return 1;
}
   1107 
   1108 bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
   1109     MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
   1110   if (Src1MBB->succ_size() == 0) {
   1111     MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
   1112     if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
   1113       MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
   1114       if (TheEntry) {
   1115         DEBUG(
   1116           dbgs() << "isLoopContBreakBlock yes src1 = BB"
   1117                  << Src1MBB->getNumber()
   1118                  << " src2 = BB" << Src2MBB->getNumber() << "\n";
   1119         );
   1120         return true;
   1121       }
   1122     }
   1123   }
   1124   return false;
   1125 }
   1126 
   1127 int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
   1128     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
   1129   int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
   1130   if (Num == 0) {
   1131     DEBUG(
   1132       dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
   1133     );
   1134     Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
   1135   }
   1136   return Num;
   1137 }
   1138 
/// Implementation of handleJumpintoIf for one arm ordering.  Walks the
/// single-successor chain starting at \p TrueMBB looking for a block that
/// \p FalseMBB reaches on a single path; once found, clones side entries on
/// both arms and re-runs the matchers on \p HeadMBB.  Returns the number of
/// patterns matched, 0 if no suitable join block is found.
int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
  int Num = 0;
  MachineBasicBlock *DownBlk;

  //trueBlk could be the common post dominator
  DownBlk = TrueMBB;

  DEBUG(
    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
           << " true = BB" << TrueMBB->getNumber()
           << ", numSucc=" << TrueMBB->succ_size()
           << " false = BB" << FalseMBB->getNumber() << "\n";
  );

  while (DownBlk) {
    DEBUG(
      dbgs() << "check down = BB" << DownBlk->getNumber();
    );

    // DownBlk is a candidate join block if the false arm reaches it on a
    // single path.
    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
      DEBUG(
        dbgs() << " working\n";
      );

      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);

      numClonedBlock += Num;
      // The cloning changed the CFG; re-run the matchers on the head's
      // two successors and on the head itself.
      Num += serialPatternMatch(*HeadMBB->succ_begin());
      Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
      Num += ifPatternMatch(HeadMBB);
      assert(Num > 0);

      break;
    }
    DEBUG(
      dbgs() << " not working\n";
    );
    // Step down the chain; stop if DownBlk branches (more than one succ).
    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
  } // walk down the postDomTree

  return Num;
}
   1183 
   1184 void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
   1185     MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
   1186     MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
   1187   dbgs() << "head = BB" << HeadMBB->getNumber()
   1188          << " size = " << HeadMBB->size();
   1189   if (Detail) {
   1190     dbgs() << "\n";
   1191     HeadMBB->print(dbgs());
   1192     dbgs() << "\n";
   1193   }
   1194 
   1195   if (TrueMBB) {
   1196     dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
   1197            << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
   1198     if (Detail) {
   1199       dbgs() << "\n";
   1200       TrueMBB->print(dbgs());
   1201       dbgs() << "\n";
   1202     }
   1203   }
   1204   if (FalseMBB) {
   1205     dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
   1206            << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
   1207     if (Detail) {
   1208       dbgs() << "\n";
   1209       FalseMBB->print(dbgs());
   1210       dbgs() << "\n";
   1211     }
   1212   }
   1213   if (LandMBB) {
   1214     dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
   1215            << LandMBB->size() << " numPred = " << LandMBB->pred_size();
   1216     if (Detail) {
   1217       dbgs() << "\n";
   1218       LandMBB->print(dbgs());
   1219       dbgs() << "\n";
   1220     }
   1221   }
   1222 
   1223     dbgs() << "\n";
   1224 }
   1225 
/// Rewrite a region where one arm of an if is branched into from outside
/// ("jump into if") so both arms are private to the region, by inserting
/// predicate-driven ENDIF/ELSE structure into the landing block and
/// migrating the shared arm's instructions there.  \p LandMBBPtr is updated
/// with the (possibly new) landing block.  Returns the number of new blocks
/// created.
///
/// NOTE(review): several paths below call report_fatal_error() and then
/// fall through to more code; since report_fatal_error does not return,
/// that trailing code is unreachable — it appears to be a placeholder for
/// unimplemented cases (see the XXX comments).
int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    MachineBasicBlock **LandMBBPtr) {
  bool MigrateTrue = false;
  bool MigrateFalse = false;

  MachineBasicBlock *LandBlk = *LandMBBPtr;

  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
         && (!FalseMBB || FalseMBB->succ_size() <= 1));

  if (TrueMBB == FalseMBB)
    return 0;

  MigrateTrue = needMigrateBlock(TrueMBB);
  MigrateFalse = needMigrateBlock(FalseMBB);

  if (!MigrateTrue && !MigrateFalse)
    return 0;

  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
  // have more than one predecessors.  without doing this, its predecessor
  // rather than headBlk will have undefined value in initReg.
  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
    MigrateTrue = true;
  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
    MigrateFalse = true;

  DEBUG(
    dbgs() << "before improveSimpleJumpintoIf: ";
    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
  );

  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
  //
  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
  //      {initReg = 0; org falseBlk branch }
  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
  //      => org landBlk
  //      if landBlk->pred_size() > 2, put the about if-else inside
  //      if (initReg !=2) {...}
  //
  // add initReg = initVal to headBlk

  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
  if (!MigrateTrue || !MigrateFalse) {
    // XXX: We have an opportunity here to optimize the "branch into if" case
    // here.  Branch into if looks like this:
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //             /      \           |
    // diamond_false        diamond_true
    //             \      /
    //               done
    //
    // The diamond_head block begins the "if" and the diamond_true block
    // is the block being "branched into".
    //
    // If MigrateTrue is true, then TrueBB is the block being "branched into"
    // and if MigrateFalse is true, then FalseBB is the block being
    // "branched into"
    //
    // Here is the pseudo code for how I think the optimization should work:
    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
    // 3. Move the branch instruction from diamond_head into its own basic
    //    block (new_block).
    // 4. Add an unconditional branch from diamond_head to new_block
    // 5. Replace the branch instruction in branch_from with an unconditional
    //    branch to new_block.  If branch_from has multiple predecessors, then
    //    we need to replace the True/False block in the branch
    //    instruction instead of replacing it.
    // 6. Change the condition of the branch instruction in new_block from
    //    COND to (COND || GPR0)
    //
    // In order insert these MOV instruction, we will need to use the
    // RegisterScavenger.  Usually liveness stops being tracked during
    // the late machine optimization passes, however if we implement
    // bool TargetRegisterInfo::requiresRegisterScavenging(
    //                                                const MachineFunction &MF)
    // and have it return true, liveness will be tracked correctly
    // by generic optimization passes.  We will also need to make sure that
    // all of our target-specific passes that run after regalloc and before
    // the CFGStructurizer track liveness and we will need to modify this pass
    // to correctly track liveness.
    //
    // After the above changes, the new CFG should look like this:
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //                       \     /
    //                      new_block
    //                      /      |
    //         diamond_false        diamond_true
    //                      \      /
    //                        done
    //
    // Without this optimization, we are forced to duplicate the diamond_true
    // block and we will end up with a CFG like this:
    //
    //                        entry
    //                       /     |
    //           diamond_head       branch_from
    //             /      \                   |
    // diamond_false        diamond_true      diamond_true (duplicate)
    //             \      /                   |
    //               done --------------------|
    //
    // Duplicating diamond_true can be very costly especially if it has a
    // lot of instructions.
    return 0;
  }

  int NumNewBlk = 0;

  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);

  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);

  if (LandBlkHasOtherPred) {
    // NOTE(review): unimplemented case — aborts; the code below the
    // report_fatal_error calls is unreachable.
    report_fatal_error("Extra register needed to handle CFG");
    unsigned CmpResReg =
      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
    report_fatal_error("Extra compare instruction needed to handle CFG");
    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
        CmpResReg, DebugLoc());
  }

  // XXX: We are running this after RA, so creating virtual registers will
  // cause an assertion failure in the PostRA scheduling pass.
  unsigned InitReg =
    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
      DebugLoc());

  if (MigrateTrue) {
    migrateInstruction(TrueMBB, LandBlk, I);
    // need to uncondionally insert the assignment to ensure a path from its
    // predecessor rather than headBlk has valid value in initReg if
    // (initVal != 1).
    report_fatal_error("Extra register needed to handle CFG");
  }
  insertInstrBefore(I, AMDGPU::ELSE);

  if (MigrateFalse) {
    migrateInstruction(FalseMBB, LandBlk, I);
    // need to uncondionally insert the assignment to ensure a path from its
    // predecessor rather than headBlk has valid value in initReg if
    // (initVal != 0)
    report_fatal_error("Extra register needed to handle CFG");
  }

  if (LandBlkHasOtherPred) {
    // add endif
    insertInstrBefore(I, AMDGPU::ENDIF);

    // put initReg = 2 to other predecessors of landBlk
    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
         PE = LandBlk->pred_end(); PI != PE; ++PI) {
      MachineBasicBlock *MBB = *PI;
      if (MBB != TrueMBB && MBB != FalseMBB)
        report_fatal_error("Extra register needed to handle CFG");
    }
  }
  DEBUG(
    dbgs() << "result from improveSimpleJumpintoIf: ";
    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
  );

  // update landBlk
  *LandMBBPtr = LandBlk;

  return NumNewBlk;
}
   1402 
/// Fuse \p SrcMBB into \p DstMBB for the serial pattern: move all of Src's
/// instructions to the end of Dst, redirect Dst to Src's successors, and
/// retire the now-empty Src block.
void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
    MachineBasicBlock *SrcMBB) {
  DEBUG(
    dbgs() << "serialPattern BB" << DstMBB->getNumber()
           << " <= BB" << SrcMBB->getNumber() << "\n";
  );
  // Move every instruction of SrcMBB to the end of DstMBB.
  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());

  // DstMBB inherits SrcMBB's successors in place of its edge to SrcMBB.
  DstMBB->removeSuccessor(SrcMBB, true);
  cloneSuccessorList(DstMBB, SrcMBB);

  // SrcMBB is empty now; drop its edges and retire it.
  removeSuccessor(SrcMBB);
  MLI->removeBlock(SrcMBB);
  retireBlock(SrcMBB);
}
   1418 
/// Merge a matched if/then/else region into \p MBB: replace the conditional
/// branch with an IF pseudo, splice in the (optional) true and false arms
/// separated by ELSE, close with ENDIF, and retire the consumed arm blocks.
/// \p TrueMBB must be non-null; \p FalseMBB and \p LandMBB may be null.
void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
  assert (TrueMBB);
  DEBUG(
    dbgs() << "ifPattern BB" << MBB->getNumber();
    dbgs() << "{  ";
    if (TrueMBB) {
      dbgs() << "BB" << TrueMBB->getNumber();
    }
    dbgs() << "  } else ";
    dbgs() << "{  ";
    if (FalseMBB) {
      dbgs() << "BB" << FalseMBB->getNumber();
    }
    dbgs() << "  }\n ";
    dbgs() << "landBlock: ";
    if (!LandMBB) {
      dbgs() << "NULL";
    } else {
      dbgs() << "BB" << LandMBB->getNumber();
    }
    dbgs() << "\n";
  );

  // Capture opcode and debug location before the branch is erased below.
  int OldOpcode = BranchMI->getOpcode();
  DebugLoc BranchDL = BranchMI->getDebugLoc();

//    transform to
//    if cond
//       trueBlk
//    else
//       falseBlk
//    endif
//    landBlk

  // Insert the structured IF right before the old conditional branch.
  MachineBasicBlock::iterator I = BranchMI;
  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
      BranchDL);

  if (TrueMBB) {
    // Inline the true arm and retire its block.
    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
    MBB->removeSuccessor(TrueMBB, true);
    if (LandMBB && TrueMBB->succ_size()!=0)
      TrueMBB->removeSuccessor(LandMBB, true);
    retireBlock(TrueMBB);
    MLI->removeBlock(TrueMBB);
  }

  if (FalseMBB) {
    // Inline the false arm after an ELSE and retire its block.
    insertInstrBefore(I, AMDGPU::ELSE);
    MBB->splice(I, FalseMBB, FalseMBB->begin(),
                   FalseMBB->end());
    MBB->removeSuccessor(FalseMBB, true);
    if (LandMBB && FalseMBB->succ_size() != 0)
      FalseMBB->removeSuccessor(LandMBB, true);
    retireBlock(FalseMBB);
    MLI->removeBlock(FalseMBB);
  }
  insertInstrBefore(I, AMDGPU::ENDIF);

  BranchMI->eraseFromParent();

  // With both arms consumed, MBB now flows directly into the land block.
  if (LandMBB && TrueMBB && FalseMBB)
    MBB->addSuccessor(LandMBB);

}
   1486 
/// Wrap the fully-reduced loop body \p DstBlk in WHILELOOP/ENDLOOP pseudos
/// and redirect its self-edge to the loop's landing block \p LandMBB.
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
    MachineBasicBlock *LandMBB) {
  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
               << " land = BB" << LandMBB->getNumber() << "\n";);

  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
  // The back-edge (DstBlk -> DstBlk) becomes the fall-through to LandMBB.
  DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
   1496 
   1497 
/// Rewrite the loop-exit edge of \p ExitingMBB as a structured break:
/// replace its conditional branch with IF_PREDICATE_SET / BREAK / ENDIF
/// (reversing the predicate if the branch's true target is not the exit)
/// and remove the edge to \p LandMBB.
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
    MachineBasicBlock *LandMBB) {
  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
               << " land = BB" << LandMBB->getNumber() << "\n";);
  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
  assert(BranchMI && isCondBranch(BranchMI));
  DebugLoc DL = BranchMI->getDebugLoc();
  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
  MachineBasicBlock::iterator I = BranchMI;
  // Make "condition true" mean "leave the loop".
  if (TrueBranch != LandMBB)
    reversePredicateSetter(I);
  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
  insertInstrBefore(I, AMDGPU::BREAK);
  insertInstrBefore(I, AMDGPU::ENDIF);
  //now branchInst can be erase safely
  BranchMI->eraseFromParent();
  //now take care of successors, retire blocks
  ExitingMBB->removeSuccessor(LandMBB, true);
}
   1517 
/// Rewrite the back-edge of latch block \p ContingMBB (targeting the loop
/// header \p ContMBB) as a structured CONTINUE.  If the latch ends in a
/// conditional branch, it becomes either a guarded CONTINUE (IF/CONTINUE/
/// ENDIF) or a single CONTINUE_LOGICAL* when the branch is the last
/// instruction; if the branch was already erased, a bare CONTINUE is
/// appended.
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
    MachineBasicBlock *ContMBB) {
  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
               << ContingMBB->getNumber()
               << ", cont = BB" << ContMBB->getNumber() << "\n";);

  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
  if (MI) {
    assert(isCondBranch(MI));
    MachineBasicBlock::iterator I = MI;
    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
    int OldOpcode = MI->getOpcode();
    DebugLoc DL = MI->getDebugLoc();

    // The compact CONTINUE_LOGICAL form is only usable when the branch is
    // the block's final instruction.
    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);

    if (!UseContinueLogical) {
      // Pick the branch polarity so "condition true" means "continue".
      int BranchOpcode =
          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
          getBranchZeroOpcode(OldOpcode);
      insertCondBranchBefore(I, BranchOpcode, DL);
      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
    } else {
      int BranchOpcode =
          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
          getContinueZeroOpcode(OldOpcode);
      insertCondBranchBefore(I, BranchOpcode, DL);
    }

    MI->eraseFromParent();
  } else {
    // if we've arrived here then we've already erased the branch instruction
    // travel back up the basic block to see the last reference of our debug
    // location we've just inserted that reference here so it should be
    // representative insertEnd to ensure phi-moves, if exist, go before the
    // continue-instr.
    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
        getLastDebugLocInBB(ContingMBB));
  }
}
   1560 
   1561 int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
   1562     MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
   1563   int Cloned = 0;
   1564   assert(PreMBB->isSuccessor(SrcMBB));
   1565   while (SrcMBB && SrcMBB != DstMBB) {
   1566     assert(SrcMBB->succ_size() == 1);
   1567     if (SrcMBB->pred_size() > 1) {
   1568       SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
   1569       ++Cloned;
   1570     }
   1571 
   1572     PreMBB = SrcMBB;
   1573     SrcMBB = *SrcMBB->succ_begin();
   1574   }
   1575 
   1576   return Cloned;
   1577 }
   1578 
   1579 MachineBasicBlock *
   1580 AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
   1581     MachineBasicBlock *PredMBB) {
   1582   assert(PredMBB->isSuccessor(MBB) &&
   1583          "succBlk is not a prececessor of curBlk");
   1584 
   1585   MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
   1586   replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
   1587   //srcBlk, oldBlk, newBlk
   1588 
   1589   PredMBB->replaceSuccessor(MBB, CloneMBB);
   1590 
   1591   // add all successor to cloneBlk
   1592   cloneSuccessorList(CloneMBB, MBB);
   1593 
   1594   numClonedInstr += MBB->size();
   1595 
   1596   DEBUG(
   1597     dbgs() << "Cloned block: " << "BB"
   1598            << MBB->getNumber() << "size " << MBB->size() << "\n";
   1599   );
   1600 
   1601   SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
   1602 
   1603   return CloneMBB;
   1604 }
   1605 
   1606 void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
   1607     MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
   1608   MachineBasicBlock::iterator SpliceEnd;
   1609   //look for the input branchinstr, not the AMDGPU branchinstr
   1610   MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
   1611   if (!BranchMI) {
   1612     DEBUG(
   1613       dbgs() << "migrateInstruction don't see branch instr\n" ;
   1614     );
   1615     SpliceEnd = SrcMBB->end();
   1616   } else {
   1617     DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
   1618     SpliceEnd = BranchMI;
   1619   }
   1620   DEBUG(
   1621     dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
   1622       << "srcSize = " << SrcMBB->size() << "\n";
   1623   );
   1624 
   1625   //splice insert before insertPos
   1626   DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
   1627 
   1628   DEBUG(
   1629     dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
   1630       << "srcSize = " << SrcMBB->size() << '\n';
   1631   );
   1632 }
   1633 
MachineBasicBlock *
AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
  // Detect an infinite loop (a latch ending in an unconditional branch) and
  // report it as unsupported. Always returns nullptr.
  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();

  // Nothing to normalize without a well-formed header/latch pair.
  if (!LoopHeader || !LoopLatch)
    return nullptr;
  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
  // Is LoopRep an infinite loop ?
  if (!BranchMI || !isUncondBranch(BranchMI))
    return nullptr;

  // NOTE(review): this dummy block is created and added to the function but
  // never wired into the CFG before the error path below — presumably a
  // leftover from an earlier implementation that rewrote the branch; confirm
  // before relying on it.
  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
  FuncRep->push_back(DummyExitBlk);  //insert to function
  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
  // Handling this CFG shape would need an extra register; emit a diagnostic
  // instead of transforming.
  LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext();
  Ctx.emitError("Extra register needed to handle CFG");
  return nullptr;
}
   1654 
   1655 void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
   1656   MachineInstr *BranchMI;
   1657 
   1658   // I saw two unconditional branch in one basic block in example
   1659   // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
   1660   while ((BranchMI = getLoopendBlockBranchInstr(MBB))
   1661           && isUncondBranch(BranchMI)) {
   1662     DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
   1663     BranchMI->eraseFromParent();
   1664   }
   1665 }
   1666 
   1667 void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
   1668     MachineBasicBlock *MBB) {
   1669   if (MBB->succ_size() != 2)
   1670     return;
   1671   MachineBasicBlock *MBB1 = *MBB->succ_begin();
   1672   MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
   1673   if (MBB1 != MBB2)
   1674     return;
   1675 
   1676   MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
   1677   assert(BranchMI && isCondBranch(BranchMI));
   1678   DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
   1679   BranchMI->eraseFromParent();
   1680   SHOWNEWBLK(MBB1, "Removing redundant successor");
   1681   MBB->removeSuccessor(MBB1, true);
   1682 }
   1683 
   1684 void AMDGPUCFGStructurizer::addDummyExitBlock(
   1685     SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
   1686   MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
   1687   FuncRep->push_back(DummyExitBlk);  //insert to function
   1688   insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
   1689 
   1690   for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
   1691        E = RetMBB.end(); It != E; ++It) {
   1692     MachineBasicBlock *MBB = *It;
   1693     MachineInstr *MI = getReturnInstr(MBB);
   1694     if (MI)
   1695       MI->eraseFromParent();
   1696     MBB->addSuccessor(DummyExitBlk);
   1697     DEBUG(
   1698       dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
   1699              << " successors\n";
   1700     );
   1701   }
   1702   SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
   1703 }
   1704 
   1705 void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
   1706   while (MBB->succ_size())
   1707     MBB->removeSuccessor(*MBB->succ_begin());
   1708 }
   1709 
   1710 void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
   1711     int SccNum) {
   1712   BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
   1713   if (!srcBlkInfo)
   1714     srcBlkInfo = new BlockInformation();
   1715   srcBlkInfo->SccNum = SccNum;
   1716 }
   1717 
   1718 void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
   1719   DEBUG(
   1720         dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
   1721   );
   1722 
   1723   BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
   1724 
   1725   if (!SrcBlkInfo)
   1726     SrcBlkInfo = new BlockInformation();
   1727 
   1728   SrcBlkInfo->IsRetired = true;
   1729   assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
   1730          && "can't retire block yet");
   1731 }
   1732 
// Pass identification; the address of ID is the pass's unique identifier.
char AMDGPUCFGStructurizer::ID = 0;
   1734 
   1735 } // end anonymous namespace
   1736 
   1737 
// Register the pass with the PassRegistry, declaring its analysis
// dependencies (dominators, post-dominators, loop info) in between.
INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
                      "AMDGPU CFG Structurizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
                      "AMDGPU CFG Structurizer", false, false)
   1745 
   1746 FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
   1747   return new AMDGPUCFGStructurizer();
   1748 }
   1749