Home | History | Annotate | Download | only in AMDGPU
      1 //===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //==-----------------------------------------------------------------------===//
      9 
     10 #include "AMDGPU.h"
     11 #include "AMDGPUSubtarget.h"
     12 #include "R600InstrInfo.h"
     13 #include "R600RegisterInfo.h"
     14 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     15 #include "llvm/ADT/DepthFirstIterator.h"
     16 #include "llvm/ADT/SCCIterator.h"
     17 #include "llvm/ADT/SmallPtrSet.h"
     18 #include "llvm/ADT/SmallVector.h"
     19 #include "llvm/ADT/Statistic.h"
     20 #include "llvm/ADT/StringRef.h"
     21 #include "llvm/CodeGen/MachineBasicBlock.h"
     22 #include "llvm/CodeGen/MachineDominators.h"
     23 #include "llvm/CodeGen/MachineFunction.h"
     24 #include "llvm/CodeGen/MachineFunctionPass.h"
     25 #include "llvm/CodeGen/MachineInstr.h"
     26 #include "llvm/CodeGen/MachineInstrBuilder.h"
     27 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     28 #include "llvm/CodeGen/MachineLoopInfo.h"
     29 #include "llvm/CodeGen/MachineOperand.h"
     30 #include "llvm/CodeGen/MachinePostDominators.h"
     31 #include "llvm/CodeGen/MachineRegisterInfo.h"
     32 #include "llvm/IR/DebugLoc.h"
     33 #include "llvm/IR/LLVMContext.h"
     34 #include "llvm/Pass.h"
     35 #include "llvm/Support/Debug.h"
     36 #include "llvm/Support/ErrorHandling.h"
     37 #include "llvm/Support/MachineValueType.h"
     38 #include "llvm/Support/raw_ostream.h"
     39 #include <cassert>
     40 #include <cstddef>
     41 #include <deque>
     42 #include <iterator>
     43 #include <map>
     44 #include <utility>
     45 #include <vector>
     46 
     47 using namespace llvm;
     48 
     49 #define DEBUG_TYPE "structcfg"
     50 
     51 #define DEFAULT_VEC_SLOTS 8
     52 
     53 // TODO: move-begin.
     54 
     55 //===----------------------------------------------------------------------===//
     56 //
     57 // Statistics for CFGStructurizer.
     58 //
     59 //===----------------------------------------------------------------------===//
     60 
     61 STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
     62     "matched");
     63 STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
     64     "matched");
     65 STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
     66 STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
     67 
     68 namespace llvm {
     69 
     70 void initializeAMDGPUCFGStructurizerPass(PassRegistry &);
     71 
     72 } // end namespace llvm
     73 
     74 namespace {
     75 
     76 //===----------------------------------------------------------------------===//
     77 //
     78 // Miscellaneous utility for CFGStructurizer.
     79 //
     80 //===----------------------------------------------------------------------===//
     81 
     82 #define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n");
     83 
     84 #define SHOWNEWBLK(b, msg)                                                     \
     85   LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size();  \
     86              dbgs() << "\n";);
     87 
     88 #define SHOWBLK_DETAIL(b, msg)                                                 \
     89   LLVM_DEBUG(if (b) {                                                          \
     90     dbgs() << msg << "BB" << b->getNumber() << "size " << b->size();           \
     91     b->print(dbgs());                                                          \
     92     dbgs() << "\n";                                                            \
     93   });
     94 
     95 #define INVALIDSCCNUM -1
     96 
     97 //===----------------------------------------------------------------------===//
     98 //
     99 // supporting data structure for CFGStructurizer
    100 //
    101 //===----------------------------------------------------------------------===//
    102 
    103 class BlockInformation {
    104 public:
    105   bool IsRetired = false;
    106   int SccNum = INVALIDSCCNUM;
    107 
    108   BlockInformation() = default;
    109 };
    110 
    111 //===----------------------------------------------------------------------===//
    112 //
    113 // CFGStructurizer
    114 //
    115 //===----------------------------------------------------------------------===//
    116 
    117 class AMDGPUCFGStructurizer : public MachineFunctionPass {
    118 public:
    119   using MBBVector = SmallVector<MachineBasicBlock *, 32>;
    120   using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>;
    121   using LoopLandInfoMap = std::map<MachineLoop *, MachineBasicBlock *>;
    122 
    123   enum PathToKind {
    124     Not_SinglePath = 0,
    125     SinglePath_InPath = 1,
    126     SinglePath_NotInPath = 2
    127   };
    128 
    129   static char ID;
    130 
    131   AMDGPUCFGStructurizer() : MachineFunctionPass(ID) {
    132     initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
    133   }
    134 
    135   StringRef getPassName() const override {
    136     return "AMDGPU Control Flow Graph structurizer Pass";
    137   }
    138 
    139   void getAnalysisUsage(AnalysisUsage &AU) const override {
    140     AU.addRequired<MachineDominatorTree>();
    141     AU.addRequired<MachinePostDominatorTree>();
    142     AU.addRequired<MachineLoopInfo>();
    143     MachineFunctionPass::getAnalysisUsage(AU);
    144   }
    145 
    146   /// Perform the CFG structurization
    147   bool run();
    148 
    149   /// Perform the CFG preparation
    150   /// This step will remove every unconditionnal/dead jump instructions and make
    151   /// sure all loops have an exit block
    152   bool prepare();
    153 
    154   bool runOnMachineFunction(MachineFunction &MF) override {
    155     TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
    156     TRI = &TII->getRegisterInfo();
    157     LLVM_DEBUG(MF.dump(););
    158     OrderedBlks.clear();
    159     Visited.clear();
    160     FuncRep = &MF;
    161     MLI = &getAnalysis<MachineLoopInfo>();
    162     LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
    163     MDT = &getAnalysis<MachineDominatorTree>();
    164     LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
    165     PDT = &getAnalysis<MachinePostDominatorTree>();
    166     LLVM_DEBUG(PDT->print(dbgs()););
    167     prepare();
    168     run();
    169     LLVM_DEBUG(MF.dump(););
    170     return true;
    171   }
    172 
    173 protected:
    174   MachineDominatorTree *MDT;
    175   MachinePostDominatorTree *PDT;
    176   MachineLoopInfo *MLI;
    177   const R600InstrInfo *TII = nullptr;
    178   const R600RegisterInfo *TRI = nullptr;
    179 
    180   // PRINT FUNCTIONS
    181   /// Print the ordered Blocks.
    182   void printOrderedBlocks() const {
    183     size_t i = 0;
    184     for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
    185         iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
    186       dbgs() << "BB" << (*iterBlk)->getNumber();
    187       dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
    188       if (i != 0 && i % 10 == 0) {
    189         dbgs() << "\n";
    190       } else {
    191         dbgs() << " ";
    192       }
    193     }
    194   }
    195 
    196   static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
    197     for (MachineLoop::iterator iter = LoopInfo.begin(),
    198          iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
    199       (*iter)->print(dbgs(), 0);
    200     }
    201   }
    202 
    203   // UTILITY FUNCTIONS
    204   int getSCCNum(MachineBasicBlock *MBB) const;
    205   MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
    206   bool hasBackEdge(MachineBasicBlock *MBB) const;
    207   bool isRetiredBlock(MachineBasicBlock *MBB) const;
    208   bool isActiveLoophead(MachineBasicBlock *MBB) const;
    209   PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
    210       bool AllowSideEntry = true) const;
    211   int countActiveBlock(MBBVector::const_iterator It,
    212       MBBVector::const_iterator E) const;
    213   bool needMigrateBlock(MachineBasicBlock *MBB) const;
    214 
    215   // Utility Functions
    216   void reversePredicateSetter(MachineBasicBlock::iterator I,
    217                               MachineBasicBlock &MBB);
    218   /// Compute the reversed DFS post order of Blocks
    219   void orderBlocks(MachineFunction *MF);
    220 
    221   // Function originally from CFGStructTraits
    222   void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
    223                       const DebugLoc &DL = DebugLoc());
    224   MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
    225                                   const DebugLoc &DL = DebugLoc());
    226   MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
    227   void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
    228                               const DebugLoc &DL);
    229   void insertCondBranchBefore(MachineBasicBlock *MBB,
    230                               MachineBasicBlock::iterator I, int NewOpcode,
    231                               int RegNum, const DebugLoc &DL);
    232 
    233   static int getBranchNzeroOpcode(int OldOpcode);
    234   static int getBranchZeroOpcode(int OldOpcode);
    235   static int getContinueNzeroOpcode(int OldOpcode);
    236   static int getContinueZeroOpcode(int OldOpcode);
    237   static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
    238   static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
    239   static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
    240       MachineInstr *MI);
    241   static bool isCondBranch(MachineInstr *MI);
    242   static bool isUncondBranch(MachineInstr *MI);
    243   static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
    244   static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
    245 
    246   /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
    247   ///
    248   /// BB with backward-edge could have move instructions after the branch
    249   /// instruction.  Such move instruction "belong to" the loop backward-edge.
    250   MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
    251 
    252   static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
    253   static bool isReturnBlock(MachineBasicBlock *MBB);
    254   static void cloneSuccessorList(MachineBasicBlock *DstMBB,
    255       MachineBasicBlock *SrcMBB);
    256   static MachineBasicBlock *clone(MachineBasicBlock *MBB);
    257 
    258   /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
    259   /// because the AMDGPU instruction is not recognized as terminator fix this
    260   /// and retire this routine
    261   void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
    262       MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
    263 
    264   static void wrapup(MachineBasicBlock *MBB);
    265 
    266   int patternMatch(MachineBasicBlock *MBB);
    267   int patternMatchGroup(MachineBasicBlock *MBB);
    268   int serialPatternMatch(MachineBasicBlock *MBB);
    269   int ifPatternMatch(MachineBasicBlock *MBB);
    270   int loopendPatternMatch();
    271   int mergeLoop(MachineLoop *LoopRep);
    272 
    273   /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
    274   /// the same loop with LoopLandInfo without explicitly keeping track of
    275   /// loopContBlks and loopBreakBlks, this is a method to get the information.
    276   bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
    277       MachineBasicBlock *Src2MBB);
    278   int handleJumpintoIf(MachineBasicBlock *HeadMBB,
    279       MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
    280   int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
    281       MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
    282   int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
    283       MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    284       MachineBasicBlock **LandMBBPtr);
    285   void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
    286       MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    287       MachineBasicBlock *LandMBB, bool Detail = false);
    288   int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
    289       MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
    290   void mergeSerialBlock(MachineBasicBlock *DstMBB,
    291       MachineBasicBlock *SrcMBB);
    292 
    293   void mergeIfthenelseBlock(MachineInstr *BranchMI,
    294       MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
    295       MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
    296   void mergeLooplandBlock(MachineBasicBlock *DstMBB,
    297       MachineBasicBlock *LandMBB);
    298   void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
    299       MachineBasicBlock *LandMBB);
    300   void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
    301       MachineBasicBlock *ContMBB);
    302 
    303   /// normalizeInfiniteLoopExit change
    304   ///   B1:
    305   ///        uncond_br LoopHeader
    306   ///
    307   /// to
    308   ///   B1:
    309   ///        cond_br 1 LoopHeader dummyExit
    310   /// and return the newly added dummy exit block
    311   MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
    312   void removeUnconditionalBranch(MachineBasicBlock *MBB);
    313 
    314   /// Remove duplicate branches instructions in a block.
    315   /// For instance
    316   /// B0:
    317   ///    cond_br X B1 B2
    318   ///    cond_br X B1 B2
    319   /// is transformed to
    320   /// B0:
    321   ///    cond_br X B1 B2
    322   void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
    323 
    324   void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
    325   void removeSuccessor(MachineBasicBlock *MBB);
    326   MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
    327       MachineBasicBlock *PredMBB);
    328   void migrateInstruction(MachineBasicBlock *SrcMBB,
    329       MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
    330   void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
    331   void retireBlock(MachineBasicBlock *MBB);
    332 
    333 private:
    334   MBBInfoMap BlockInfoMap;
    335   LoopLandInfoMap LLInfoMap;
    336   std::map<MachineLoop *, bool> Visited;
    337   MachineFunction *FuncRep;
    338   SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
    339 };
    340 
    341 } // end anonymous namespace
    342 
    343 char AMDGPUCFGStructurizer::ID = 0;
    344 
    345 int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
    346   MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
    347   if (It == BlockInfoMap.end())
    348     return INVALIDSCCNUM;
    349   return (*It).second->SccNum;
    350 }
    351 
    352 MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
    353     const {
    354   LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
    355   if (It == LLInfoMap.end())
    356     return nullptr;
    357   return (*It).second;
    358 }
    359 
    360 bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
    361   MachineLoop *LoopRep = MLI->getLoopFor(MBB);
    362   if (!LoopRep)
    363     return false;
    364   MachineBasicBlock *LoopHeader = LoopRep->getHeader();
    365   return MBB->isSuccessor(LoopHeader);
    366 }
    367 
    368 bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
    369   MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
    370   if (It == BlockInfoMap.end())
    371     return false;
    372   return (*It).second->IsRetired;
    373 }
    374 
    375 bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
    376   MachineLoop *LoopRep = MLI->getLoopFor(MBB);
    377   while (LoopRep && LoopRep->getHeader() == MBB) {
    378     MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
    379     if(!LoopLand)
    380       return true;
    381     if (!isRetiredBlock(LoopLand))
    382       return true;
    383     LoopRep = LoopRep->getParentLoop();
    384   }
    385   return false;
    386 }
    387 
    388 AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
    389     MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
    390     bool AllowSideEntry) const {
    391   assert(DstMBB);
    392   if (SrcMBB == DstMBB)
    393     return SinglePath_InPath;
    394   while (SrcMBB && SrcMBB->succ_size() == 1) {
    395     SrcMBB = *SrcMBB->succ_begin();
    396     if (SrcMBB == DstMBB)
    397       return SinglePath_InPath;
    398     if (!AllowSideEntry && SrcMBB->pred_size() > 1)
    399       return Not_SinglePath;
    400   }
    401   if (SrcMBB && SrcMBB->succ_size()==0)
    402     return SinglePath_NotInPath;
    403   return Not_SinglePath;
    404 }
    405 
    406 int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
    407     MBBVector::const_iterator E) const {
    408   int Count = 0;
    409   while (It != E) {
    410     if (!isRetiredBlock(*It))
    411       ++Count;
    412     ++It;
    413   }
    414   return Count;
    415 }
    416 
    417 bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
    418   unsigned BlockSizeThreshold = 30;
    419   unsigned CloneInstrThreshold = 100;
    420   bool MultiplePreds = MBB && (MBB->pred_size() > 1);
    421 
    422   if(!MultiplePreds)
    423     return false;
    424   unsigned BlkSize = MBB->size();
    425   return ((BlkSize > BlockSizeThreshold) &&
    426       (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
    427 }
    428 
    429 void AMDGPUCFGStructurizer::reversePredicateSetter(
    430     MachineBasicBlock::iterator I, MachineBasicBlock &MBB) {
    431   assert(I.isValid() && "Expected valid iterator");
    432   for (;; --I) {
    433     if (I == MBB.end())
    434       continue;
    435     if (I->getOpcode() == R600::PRED_X) {
    436       switch (I->getOperand(2).getImm()) {
    437       case R600::PRED_SETE_INT:
    438         I->getOperand(2).setImm(R600::PRED_SETNE_INT);
    439         return;
    440       case R600::PRED_SETNE_INT:
    441         I->getOperand(2).setImm(R600::PRED_SETE_INT);
    442         return;
    443       case R600::PRED_SETE:
    444         I->getOperand(2).setImm(R600::PRED_SETNE);
    445         return;
    446       case R600::PRED_SETNE:
    447         I->getOperand(2).setImm(R600::PRED_SETE);
    448         return;
    449       default:
    450         llvm_unreachable("PRED_X Opcode invalid!");
    451       }
    452     }
    453   }
    454 }
    455 
    456 void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
    457                                            int NewOpcode, const DebugLoc &DL) {
    458   MachineInstr *MI =
    459       MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
    460   MBB->push_back(MI);
    461   //assume the instruction doesn't take any reg operand ...
    462   SHOWNEWINSTR(MI);
    463 }
    464 
    465 MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
    466                                                        int NewOpcode,
    467                                                        const DebugLoc &DL) {
    468   MachineInstr *MI =
    469       MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
    470   if (MBB->begin() != MBB->end())
    471     MBB->insert(MBB->begin(), MI);
    472   else
    473     MBB->push_back(MI);
    474   SHOWNEWINSTR(MI);
    475   return MI;
    476 }
    477 
    478 MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
    479     MachineBasicBlock::iterator I, int NewOpcode) {
    480   MachineInstr *OldMI = &(*I);
    481   MachineBasicBlock *MBB = OldMI->getParent();
    482   MachineInstr *NewMBB =
    483       MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
    484   MBB->insert(I, NewMBB);
    485   //assume the instruction doesn't take any reg operand ...
    486   SHOWNEWINSTR(NewMBB);
    487   return NewMBB;
    488 }
    489 
    490 void AMDGPUCFGStructurizer::insertCondBranchBefore(
    491     MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
    492   MachineInstr *OldMI = &(*I);
    493   MachineBasicBlock *MBB = OldMI->getParent();
    494   MachineFunction *MF = MBB->getParent();
    495   MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
    496   MBB->insert(I, NewMI);
    497   MachineInstrBuilder MIB(*MF, NewMI);
    498   MIB.addReg(OldMI->getOperand(1).getReg(), false);
    499   SHOWNEWINSTR(NewMI);
    500   //erase later oldInstr->eraseFromParent();
    501 }
    502 
    503 void AMDGPUCFGStructurizer::insertCondBranchBefore(
    504     MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
    505     int RegNum, const DebugLoc &DL) {
    506   MachineFunction *MF = blk->getParent();
    507   MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
    508   //insert before
    509   blk->insert(I, NewInstr);
    510   MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
    511   SHOWNEWINSTR(NewInstr);
    512 }
    513 
    514 int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
    515   switch(OldOpcode) {
    516   case R600::JUMP_COND:
    517   case R600::JUMP: return R600::IF_PREDICATE_SET;
    518   case R600::BRANCH_COND_i32:
    519   case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32;
    520   default: llvm_unreachable("internal error");
    521   }
    522   return -1;
    523 }
    524 
    525 int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
    526   switch(OldOpcode) {
    527   case R600::JUMP_COND:
    528   case R600::JUMP: return R600::IF_PREDICATE_SET;
    529   case R600::BRANCH_COND_i32:
    530   case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32;
    531   default: llvm_unreachable("internal error");
    532   }
    533   return -1;
    534 }
    535 
    536 int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
    537   switch(OldOpcode) {
    538   case R600::JUMP_COND:
    539   case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32;
    540   default: llvm_unreachable("internal error");
    541   }
    542   return -1;
    543 }
    544 
    545 int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
    546   switch(OldOpcode) {
    547   case R600::JUMP_COND:
    548   case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32;
    549   default: llvm_unreachable("internal error");
    550   }
    551   return -1;
    552 }
    553 
    554 MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
    555   return MI->getOperand(0).getMBB();
    556 }
    557 
    558 void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
    559     MachineBasicBlock *MBB) {
    560   MI->getOperand(0).setMBB(MBB);
    561 }
    562 
    563 MachineBasicBlock *
    564 AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
    565     MachineInstr *MI) {
    566   assert(MBB->succ_size() == 2);
    567   MachineBasicBlock *TrueBranch = getTrueBranch(MI);
    568   MachineBasicBlock::succ_iterator It = MBB->succ_begin();
    569   MachineBasicBlock::succ_iterator Next = It;
    570   ++Next;
    571   return (*It == TrueBranch) ? *Next : *It;
    572 }
    573 
    574 bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
    575   switch (MI->getOpcode()) {
    576     case R600::JUMP_COND:
    577     case R600::BRANCH_COND_i32:
    578     case R600::BRANCH_COND_f32: return true;
    579   default:
    580     return false;
    581   }
    582   return false;
    583 }
    584 
    585 bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
    586   switch (MI->getOpcode()) {
    587   case R600::JUMP:
    588   case R600::BRANCH:
    589     return true;
    590   default:
    591     return false;
    592   }
    593   return false;
    594 }
    595 
    596 DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
    597   //get DebugLoc from the first MachineBasicBlock instruction with debug info
    598   DebugLoc DL;
    599   for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
    600       ++It) {
    601     MachineInstr *instr = &(*It);
    602     if (instr->getDebugLoc())
    603       DL = instr->getDebugLoc();
    604   }
    605   return DL;
    606 }
    607 
    608 MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
    609     MachineBasicBlock *MBB) {
    610   MachineBasicBlock::reverse_iterator It = MBB->rbegin();
    611   MachineInstr *MI = &*It;
    612   if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
    613     return MI;
    614   return nullptr;
    615 }
    616 
    617 MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
    618     MachineBasicBlock *MBB) {
    619   for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
    620       It != E; ++It) {
    621     // FIXME: Simplify
    622     MachineInstr *MI = &*It;
    623     if (MI) {
    624       if (isCondBranch(MI) || isUncondBranch(MI))
    625         return MI;
    626       else if (!TII->isMov(MI->getOpcode()))
    627         break;
    628     }
    629   }
    630   return nullptr;
    631 }
    632 
    633 MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
    634   MachineBasicBlock::reverse_iterator It = MBB->rbegin();
    635   if (It != MBB->rend()) {
    636     MachineInstr *instr = &(*It);
    637     if (instr->getOpcode() == R600::RETURN)
    638       return instr;
    639   }
    640   return nullptr;
    641 }
    642 
    643 bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
    644   MachineInstr *MI = getReturnInstr(MBB);
    645   bool IsReturn = (MBB->succ_size() == 0);
    646   if (MI)
    647     assert(IsReturn);
    648   else if (IsReturn)
    649     LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber()
    650                       << " is return block without RETURN instr\n";);
    651   return  IsReturn;
    652 }
    653 
    654 void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
    655     MachineBasicBlock *SrcMBB) {
    656   for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
    657        iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
    658     DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
    659 }
    660 
    661 MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
    662   MachineFunction *Func = MBB->getParent();
    663   MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
    664   Func->push_back(NewMBB);  //insert to function
    665   for (const MachineInstr &It : *MBB)
    666     NewMBB->push_back(Func->CloneMachineInstr(&It));
    667   return NewMBB;
    668 }
    669 
    670 void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
    671     MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
    672     MachineBasicBlock *NewBlk) {
    673   MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
    674   if (BranchMI && isCondBranch(BranchMI) &&
    675       getTrueBranch(BranchMI) == OldMBB)
    676     setTrueBranch(BranchMI, NewBlk);
    677 }
    678 
    679 void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
    680   assert((!MBB->getParent()->getJumpTableInfo()
    681           || MBB->getParent()->getJumpTableInfo()->isEmpty())
    682          && "found a jump table");
    683 
    684    //collect continue right before endloop
    685    SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
    686    MachineBasicBlock::iterator Pre = MBB->begin();
    687    MachineBasicBlock::iterator E = MBB->end();
    688    MachineBasicBlock::iterator It = Pre;
    689    while (It != E) {
    690      if (Pre->getOpcode() == R600::CONTINUE
    691          && It->getOpcode() == R600::ENDLOOP)
    692        ContInstr.push_back(&*Pre);
    693      Pre = It;
    694      ++It;
    695    }
    696 
    697    //delete continue right before endloop
    698    for (unsigned i = 0; i < ContInstr.size(); ++i)
    699       ContInstr[i]->eraseFromParent();
    700 
    701    // TODO to fix up jump table so later phase won't be confused.  if
    702    // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
    703    // there isn't such an interface yet.  alternatively, replace all the other
    704    // blocks in the jump table with the entryBlk //}
    705 }
    706 
    707 bool AMDGPUCFGStructurizer::prepare() {
    708   bool Changed = false;
    709 
    710   //FIXME: if not reducible flow graph, make it so ???
    711 
    712   LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
    713 
    714   orderBlocks(FuncRep);
    715 
    716   SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
    717 
    718   // Add an ExitBlk to loop that don't have one
    719   for (MachineLoopInfo::iterator It = MLI->begin(),
    720        E = MLI->end(); It != E; ++It) {
    721     MachineLoop *LoopRep = (*It);
    722     MBBVector ExitingMBBs;
    723     LoopRep->getExitingBlocks(ExitingMBBs);
    724 
    725     if (ExitingMBBs.size() == 0) {
    726       MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
    727       if (DummyExitBlk)
    728         RetBlks.push_back(DummyExitBlk);
    729     }
    730   }
    731 
    732   // Remove unconditional branch instr.
    733   // Add dummy exit block iff there are multiple returns.
    734   for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
    735        It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
    736     MachineBasicBlock *MBB = *It;
    737     removeUnconditionalBranch(MBB);
    738     removeRedundantConditionalBranch(MBB);
    739     if (isReturnBlock(MBB)) {
    740       RetBlks.push_back(MBB);
    741     }
    742     assert(MBB->succ_size() <= 2);
    743   }
    744 
    745   if (RetBlks.size() >= 2) {
    746     addDummyExitBlock(RetBlks);
    747     Changed = true;
    748   }
    749 
    750   return Changed;
    751 }
    752 
    753 bool AMDGPUCFGStructurizer::run() {
    754   //Assume reducible CFG...
    755   LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
    756 
    757 #ifdef STRESSTEST
    758   //Use the worse block ordering to test the algorithm.
    759   ReverseVector(orderedBlks);
    760 #endif
    761 
    762   LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
    763   int NumIter = 0;
    764   bool Finish = false;
    765   MachineBasicBlock *MBB;
    766   bool MakeProgress = false;
    767   int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
    768                                         OrderedBlks.end());
    769 
    770   do {
    771     ++NumIter;
    772     LLVM_DEBUG(dbgs() << "numIter = " << NumIter
    773                       << ", numRemaintedBlk = " << NumRemainedBlk << "\n";);
    774 
    775     SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
    776         OrderedBlks.begin();
    777     SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
    778         OrderedBlks.end();
    779 
    780     SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
    781         It;
    782     MachineBasicBlock *SccBeginMBB = nullptr;
    783     int SccNumBlk = 0;  // The number of active blocks, init to a
    784                         // maximum possible number.
    785     int SccNumIter;     // Number of iteration in this SCC.
    786 
    787     while (It != E) {
    788       MBB = *It;
    789 
    790       if (!SccBeginMBB) {
    791         SccBeginIter = It;
    792         SccBeginMBB = MBB;
    793         SccNumIter = 0;
    794         SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
    795         LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
    796                    dbgs() << "\n";);
    797       }
    798 
    799       if (!isRetiredBlock(MBB))
    800         patternMatch(MBB);
    801 
    802       ++It;
    803 
    804       bool ContNextScc = true;
    805       if (It == E
    806           || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
    807         // Just finish one scc.
    808         ++SccNumIter;
    809         int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
    810         if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
    811           LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
    812                             << ", sccNumIter = " << SccNumIter;
    813                      dbgs() << "doesn't make any progress\n";);
    814           ContNextScc = true;
    815         } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
    816           SccNumBlk = sccRemainedNumBlk;
    817           It = SccBeginIter;
    818           ContNextScc = false;
    819           LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB)
    820                             << "sccNumIter = " << SccNumIter << '\n';);
    821         } else {
    822           // Finish the current scc.
    823           ContNextScc = true;
    824         }
    825       } else {
    826         // Continue on next component in the current scc.
    827         ContNextScc = false;
    828       }
    829 
    830       if (ContNextScc)
    831         SccBeginMBB = nullptr;
    832     } //while, "one iteration" over the function.
    833 
    834     MachineBasicBlock *EntryMBB =
    835         *GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
    836     if (EntryMBB->succ_size() == 0) {
    837       Finish = true;
    838       LLVM_DEBUG(dbgs() << "Reduce to one block\n";);
    839     } else {
    840       int NewnumRemainedBlk
    841         = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
    842       // consider cloned blocks ??
    843       if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
    844         MakeProgress = true;
    845         NumRemainedBlk = NewnumRemainedBlk;
    846       } else {
    847         MakeProgress = false;
    848         LLVM_DEBUG(dbgs() << "No progress\n";);
    849       }
    850     }
    851   } while (!Finish && MakeProgress);
    852 
    853   // Misc wrap up to maintain the consistency of the Function representation.
    854   wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
    855 
    856   // Detach retired Block, release memory.
    857   for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
    858       It != E; ++It) {
    859     if ((*It).second && (*It).second->IsRetired) {
    860       assert(((*It).first)->getNumber() != -1);
    861       LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";);
    862       (*It).first->eraseFromParent();  //Remove from the parent Function.
    863     }
    864     delete (*It).second;
    865   }
    866   BlockInfoMap.clear();
    867   LLInfoMap.clear();
    868 
    869   if (!Finish) {
    870     LLVM_DEBUG(FuncRep->viewCFG());
    871     report_fatal_error("IRREDUCIBLE_CFG");
    872   }
    873 
    874   return true;
    875 }
    876 
    877 void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
    878   int SccNum = 0;
    879   MachineBasicBlock *MBB;
    880   for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
    881        ++It, ++SccNum) {
    882     const std::vector<MachineBasicBlock *> &SccNext = *It;
    883     for (std::vector<MachineBasicBlock *>::const_iterator
    884          blockIter = SccNext.begin(), blockEnd = SccNext.end();
    885          blockIter != blockEnd; ++blockIter) {
    886       MBB = *blockIter;
    887       OrderedBlks.push_back(MBB);
    888       recordSccnum(MBB, SccNum);
    889     }
    890   }
    891 
    892   // walk through all the block in func to check for unreachable
    893   for (auto *MBB : nodes(MF)) {
    894     SccNum = getSCCNum(MBB);
    895     if (SccNum == INVALIDSCCNUM)
    896       dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
    897   }
    898 }
    899 
    900 int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
    901   int NumMatch = 0;
    902   int CurMatch;
    903 
    904   LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";);
    905 
    906   while ((CurMatch = patternMatchGroup(MBB)) > 0)
    907     NumMatch += CurMatch;
    908 
    909   LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber()
    910                     << ", numMatch = " << NumMatch << "\n";);
    911 
    912   return NumMatch;
    913 }
    914 
    915 int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
    916   int NumMatch = 0;
    917   NumMatch += loopendPatternMatch();
    918   NumMatch += serialPatternMatch(MBB);
    919   NumMatch += ifPatternMatch(MBB);
    920   return NumMatch;
    921 }
    922 
    923 int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
    924   if (MBB->succ_size() != 1)
    925     return 0;
    926 
    927   MachineBasicBlock *childBlk = *MBB->succ_begin();
    928   if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
    929     return 0;
    930 
    931   mergeSerialBlock(MBB, childBlk);
    932   ++numSerialPatternMatch;
    933   return 1;
    934 }
    935 
    936 int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
    937   //two edges
    938   if (MBB->succ_size() != 2)
    939     return 0;
    940   if (hasBackEdge(MBB))
    941     return 0;
    942   MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
    943   if (!BranchMI)
    944     return 0;
    945 
    946   assert(isCondBranch(BranchMI));
    947   int NumMatch = 0;
    948 
    949   MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
    950   NumMatch += serialPatternMatch(TrueMBB);
    951   NumMatch += ifPatternMatch(TrueMBB);
    952   MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
    953   NumMatch += serialPatternMatch(FalseMBB);
    954   NumMatch += ifPatternMatch(FalseMBB);
    955   MachineBasicBlock *LandBlk;
    956   int Cloned = 0;
    957 
    958   assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
    959   // TODO: Simplify
    960   if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
    961     && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
    962     // Diamond pattern
    963     LandBlk = *TrueMBB->succ_begin();
    964   } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
    965     // Triangle pattern, false is empty
    966     LandBlk = FalseMBB;
    967     FalseMBB = nullptr;
    968   } else if (FalseMBB->succ_size() == 1
    969              && *FalseMBB->succ_begin() == TrueMBB) {
    970     // Triangle pattern, true is empty
    971     // We reverse the predicate to make a triangle, empty false pattern;
    972     std::swap(TrueMBB, FalseMBB);
    973     reversePredicateSetter(MBB->end(), *MBB);
    974     LandBlk = FalseMBB;
    975     FalseMBB = nullptr;
    976   } else if (FalseMBB->succ_size() == 1
    977              && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
    978     LandBlk = *FalseMBB->succ_begin();
    979   } else if (TrueMBB->succ_size() == 1
    980     && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
    981     LandBlk = *TrueMBB->succ_begin();
    982   } else {
    983     return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
    984   }
    985 
    986   // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
    987   // new BB created for landBlk==NULL may introduce new challenge to the
    988   // reduction process.
    989   if (LandBlk &&
    990       ((TrueMBB && TrueMBB->pred_size() > 1)
    991       || (FalseMBB && FalseMBB->pred_size() > 1))) {
    992      Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
    993   }
    994 
    995   if (TrueMBB && TrueMBB->pred_size() > 1) {
    996     TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
    997     ++Cloned;
    998   }
    999 
   1000   if (FalseMBB && FalseMBB->pred_size() > 1) {
   1001     FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
   1002     ++Cloned;
   1003   }
   1004 
   1005   mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
   1006 
   1007   ++numIfPatternMatch;
   1008 
   1009   numClonedBlock += Cloned;
   1010 
   1011   return 1 + Cloned + NumMatch;
   1012 }
   1013 
   1014 int AMDGPUCFGStructurizer::loopendPatternMatch() {
   1015   std::deque<MachineLoop *> NestedLoops;
   1016   for (auto &It: *MLI)
   1017     for (MachineLoop *ML : depth_first(It))
   1018       NestedLoops.push_front(ML);
   1019 
   1020   if (NestedLoops.empty())
   1021     return 0;
   1022 
   1023   // Process nested loop outside->inside (we did push_front),
   1024   // so "continue" to a outside loop won't be mistaken as "break"
   1025   // of the current loop.
   1026   int Num = 0;
   1027   for (MachineLoop *ExaminedLoop : NestedLoops) {
   1028     if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
   1029       continue;
   1030     LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
   1031     int NumBreak = mergeLoop(ExaminedLoop);
   1032     if (NumBreak == -1)
   1033       break;
   1034     Num += NumBreak;
   1035   }
   1036   return Num;
   1037 }
   1038 
   1039 int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
   1040   MachineBasicBlock *LoopHeader = LoopRep->getHeader();
   1041   MBBVector ExitingMBBs;
   1042   LoopRep->getExitingBlocks(ExitingMBBs);
   1043   assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
   1044   LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size()
   1045                     << " exiting blocks\n";);
   1046   // We assume a single ExitBlk
   1047   MBBVector ExitBlks;
   1048   LoopRep->getExitBlocks(ExitBlks);
   1049   SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
   1050   for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
   1051     ExitBlkSet.insert(ExitBlks[i]);
   1052   assert(ExitBlkSet.size() == 1);
   1053   MachineBasicBlock *ExitBlk = *ExitBlks.begin();
   1054   assert(ExitBlk && "Loop has several exit block");
   1055   MBBVector LatchBlks;
   1056   for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader))
   1057     if (LoopRep->contains(LB))
   1058       LatchBlks.push_back(LB);
   1059 
   1060   for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
   1061     mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
   1062   for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
   1063     settleLoopcontBlock(LatchBlks[i], LoopHeader);
   1064   int Match = 0;
   1065   do {
   1066     Match = 0;
   1067     Match += serialPatternMatch(LoopHeader);
   1068     Match += ifPatternMatch(LoopHeader);
   1069   } while (Match > 0);
   1070   mergeLooplandBlock(LoopHeader, ExitBlk);
   1071   MachineLoop *ParentLoop = LoopRep->getParentLoop();
   1072   if (ParentLoop)
   1073     MLI->changeLoopFor(LoopHeader, ParentLoop);
   1074   else
   1075     MLI->removeBlock(LoopHeader);
   1076   Visited[LoopRep] = true;
   1077   return 1;
   1078 }
   1079 
   1080 bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
   1081     MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
   1082   if (Src1MBB->succ_size() == 0) {
   1083     MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
   1084     if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
   1085       MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
   1086       if (TheEntry) {
   1087         LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB"
   1088                           << Src1MBB->getNumber() << " src2 = BB"
   1089                           << Src2MBB->getNumber() << "\n";);
   1090         return true;
   1091       }
   1092     }
   1093   }
   1094   return false;
   1095 }
   1096 
   1097 int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
   1098     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
   1099   int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
   1100   if (Num == 0) {
   1101     LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk"
   1102                       << "\n";);
   1103     Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
   1104   }
   1105   return Num;
   1106 }
   1107 
   1108 int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
   1109     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
   1110   int Num = 0;
   1111   MachineBasicBlock *DownBlk;
   1112 
   1113   //trueBlk could be the common post dominator
   1114   DownBlk = TrueMBB;
   1115 
   1116   LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
   1117                     << " true = BB" << TrueMBB->getNumber()
   1118                     << ", numSucc=" << TrueMBB->succ_size() << " false = BB"
   1119                     << FalseMBB->getNumber() << "\n";);
   1120 
   1121   while (DownBlk) {
   1122     LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber(););
   1123 
   1124     if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
   1125       LLVM_DEBUG(dbgs() << " working\n";);
   1126 
   1127       Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
   1128       Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
   1129 
   1130       numClonedBlock += Num;
   1131       Num += serialPatternMatch(*HeadMBB->succ_begin());
   1132       Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
   1133       Num += ifPatternMatch(HeadMBB);
   1134       assert(Num > 0);
   1135 
   1136       break;
   1137     }
   1138     LLVM_DEBUG(dbgs() << " not working\n";);
   1139     DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
   1140   } // walk down the postDomTree
   1141 
   1142   return Num;
   1143 }
   1144 
   1145 #ifndef NDEBUG
   1146 void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
   1147     MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
   1148     MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
   1149   dbgs() << "head = BB" << HeadMBB->getNumber()
   1150          << " size = " << HeadMBB->size();
   1151   if (Detail) {
   1152     dbgs() << "\n";
   1153     HeadMBB->print(dbgs());
   1154     dbgs() << "\n";
   1155   }
   1156 
   1157   if (TrueMBB) {
   1158     dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
   1159            << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
   1160     if (Detail) {
   1161       dbgs() << "\n";
   1162       TrueMBB->print(dbgs());
   1163       dbgs() << "\n";
   1164     }
   1165   }
   1166   if (FalseMBB) {
   1167     dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
   1168            << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
   1169     if (Detail) {
   1170       dbgs() << "\n";
   1171       FalseMBB->print(dbgs());
   1172       dbgs() << "\n";
   1173     }
   1174   }
   1175   if (LandMBB) {
   1176     dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
   1177            << LandMBB->size() << " numPred = " << LandMBB->pred_size();
   1178     if (Detail) {
   1179       dbgs() << "\n";
   1180       LandMBB->print(dbgs());
   1181       dbgs() << "\n";
   1182     }
   1183   }
   1184 
   1185   dbgs() << "\n";
   1186 }
   1187 #endif
   1188 
   1189 int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
   1190     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
   1191     MachineBasicBlock **LandMBBPtr) {
   1192   bool MigrateTrue = false;
   1193   bool MigrateFalse = false;
   1194 
   1195   MachineBasicBlock *LandBlk = *LandMBBPtr;
   1196 
   1197   assert((!TrueMBB || TrueMBB->succ_size() <= 1)
   1198          && (!FalseMBB || FalseMBB->succ_size() <= 1));
   1199 
   1200   if (TrueMBB == FalseMBB)
   1201     return 0;
   1202 
   1203   MigrateTrue = needMigrateBlock(TrueMBB);
   1204   MigrateFalse = needMigrateBlock(FalseMBB);
   1205 
   1206   if (!MigrateTrue && !MigrateFalse)
   1207     return 0;
   1208 
   1209   // If we need to migrate either trueBlk and falseBlk, migrate the rest that
   1210   // have more than one predecessors.  without doing this, its predecessor
   1211   // rather than headBlk will have undefined value in initReg.
   1212   if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
   1213     MigrateTrue = true;
   1214   if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
   1215     MigrateFalse = true;
   1216 
   1217   LLVM_DEBUG(
   1218       dbgs() << "before improveSimpleJumpintoIf: ";
   1219       showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
   1220 
   1221   // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
   1222   //
   1223   // new: headBlk => if () {initReg = 1; org trueBlk branch} else
   1224   //      {initReg = 0; org falseBlk branch }
   1225   //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
   1226   //      => org landBlk
   1227   //      if landBlk->pred_size() > 2, put the about if-else inside
   1228   //      if (initReg !=2) {...}
   1229   //
   1230   // add initReg = initVal to headBlk
   1231 
   1232   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
   1233   if (!MigrateTrue || !MigrateFalse) {
   1234     // XXX: We have an opportunity here to optimize the "branch into if" case
   1235     // here.  Branch into if looks like this:
   1236     //                        entry
   1237     //                       /     |
   1238     //           diamond_head       branch_from
   1239     //             /      \           |
   1240     // diamond_false        diamond_true
   1241     //             \      /
   1242     //               done
   1243     //
   1244     // The diamond_head block begins the "if" and the diamond_true block
   1245     // is the block being "branched into".
   1246     //
   1247     // If MigrateTrue is true, then TrueBB is the block being "branched into"
   1248     // and if MigrateFalse is true, then FalseBB is the block being
   1249     // "branched into"
   1250     //
   1251     // Here is the pseudo code for how I think the optimization should work:
   1252     // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
   1253     // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
   1254     // 3. Move the branch instruction from diamond_head into its own basic
   1255     //    block (new_block).
   1256     // 4. Add an unconditional branch from diamond_head to new_block
   1257     // 5. Replace the branch instruction in branch_from with an unconditional
   1258     //    branch to new_block.  If branch_from has multiple predecessors, then
   1259     //    we need to replace the True/False block in the branch
   1260     //    instruction instead of replacing it.
   1261     // 6. Change the condition of the branch instruction in new_block from
   1262     //    COND to (COND || GPR0)
   1263     //
   1264     // In order insert these MOV instruction, we will need to use the
   1265     // RegisterScavenger.  Usually liveness stops being tracked during
   1266     // the late machine optimization passes, however if we implement
   1267     // bool TargetRegisterInfo::requiresRegisterScavenging(
   1268     //                                                const MachineFunction &MF)
   1269     // and have it return true, liveness will be tracked correctly
   1270     // by generic optimization passes.  We will also need to make sure that
   1271     // all of our target-specific passes that run after regalloc and before
   1272     // the CFGStructurizer track liveness and we will need to modify this pass
   1273     // to correctly track liveness.
   1274     //
   1275     // After the above changes, the new CFG should look like this:
   1276     //                        entry
   1277     //                       /     |
   1278     //           diamond_head       branch_from
   1279     //                       \     /
   1280     //                      new_block
   1281     //                      /      |
   1282     //         diamond_false        diamond_true
   1283     //                      \      /
   1284     //                        done
   1285     //
   1286     // Without this optimization, we are forced to duplicate the diamond_true
   1287     // block and we will end up with a CFG like this:
   1288     //
   1289     //                        entry
   1290     //                       /     |
   1291     //           diamond_head       branch_from
   1292     //             /      \                   |
   1293     // diamond_false        diamond_true      diamond_true (duplicate)
   1294     //             \      /                   |
   1295     //               done --------------------|
   1296     //
   1297     // Duplicating diamond_true can be very costly especially if it has a
   1298     // lot of instructions.
   1299     return 0;
   1300   }
   1301 
   1302   int NumNewBlk = 0;
   1303 
   1304   bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
   1305 
   1306   //insert R600::ENDIF to avoid special case "input landBlk == NULL"
   1307   MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF);
   1308 
   1309   if (LandBlkHasOtherPred) {
   1310     report_fatal_error("Extra register needed to handle CFG");
   1311     unsigned CmpResReg =
   1312       HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
   1313     report_fatal_error("Extra compare instruction needed to handle CFG");
   1314     insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET,
   1315         CmpResReg, DebugLoc());
   1316   }
   1317 
   1318   // XXX: We are running this after RA, so creating virtual registers will
   1319   // cause an assertion failure in the PostRA scheduling pass.
   1320   unsigned InitReg =
   1321     HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
   1322   insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg,
   1323       DebugLoc());
   1324 
   1325   if (MigrateTrue) {
   1326     migrateInstruction(TrueMBB, LandBlk, I);
   1327     // need to uncondionally insert the assignment to ensure a path from its
   1328     // predecessor rather than headBlk has valid value in initReg if
   1329     // (initVal != 1).
   1330     report_fatal_error("Extra register needed to handle CFG");
   1331   }
   1332   insertInstrBefore(I, R600::ELSE);
   1333 
   1334   if (MigrateFalse) {
   1335     migrateInstruction(FalseMBB, LandBlk, I);
   1336     // need to uncondionally insert the assignment to ensure a path from its
   1337     // predecessor rather than headBlk has valid value in initReg if
   1338     // (initVal != 0)
   1339     report_fatal_error("Extra register needed to handle CFG");
   1340   }
   1341 
   1342   if (LandBlkHasOtherPred) {
   1343     // add endif
   1344     insertInstrBefore(I, R600::ENDIF);
   1345 
   1346     // put initReg = 2 to other predecessors of landBlk
   1347     for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
   1348          PE = LandBlk->pred_end(); PI != PE; ++PI) {
   1349       MachineBasicBlock *MBB = *PI;
   1350       if (MBB != TrueMBB && MBB != FalseMBB)
   1351         report_fatal_error("Extra register needed to handle CFG");
   1352     }
   1353   }
   1354   LLVM_DEBUG(
   1355       dbgs() << "result from improveSimpleJumpintoIf: ";
   1356       showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0););
   1357 
   1358   // update landBlk
   1359   *LandMBBPtr = LandBlk;
   1360 
   1361   return NumNewBlk;
   1362 }
   1363 
   1364 void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
   1365     MachineBasicBlock *SrcMBB) {
   1366   LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB"
   1367                     << SrcMBB->getNumber() << "\n";);
   1368   DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
   1369 
   1370   DstMBB->removeSuccessor(SrcMBB, true);
   1371   cloneSuccessorList(DstMBB, SrcMBB);
   1372 
   1373   removeSuccessor(SrcMBB);
   1374   MLI->removeBlock(SrcMBB);
   1375   retireBlock(SrcMBB);
   1376 }
   1377 
   1378 void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
   1379     MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
   1380     MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
   1381   assert (TrueMBB);
   1382   LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{  ";
   1383              if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs()
   1384              << "  } else ";
   1385              dbgs() << "{  "; if (FalseMBB) {
   1386                dbgs() << "BB" << FalseMBB->getNumber();
   1387              } dbgs() << "  }\n ";
   1388              dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else {
   1389                dbgs() << "BB" << LandMBB->getNumber();
   1390              } dbgs() << "\n";);
   1391 
   1392   int OldOpcode = BranchMI->getOpcode();
   1393   DebugLoc BranchDL = BranchMI->getDebugLoc();
   1394 
   1395 //    transform to
   1396 //    if cond
   1397 //       trueBlk
   1398 //    else
   1399 //       falseBlk
   1400 //    endif
   1401 //    landBlk
   1402 
   1403   MachineBasicBlock::iterator I = BranchMI;
   1404   insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
   1405       BranchDL);
   1406 
   1407   if (TrueMBB) {
   1408     MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
   1409     MBB->removeSuccessor(TrueMBB, true);
   1410     if (LandMBB && TrueMBB->succ_size()!=0)
   1411       TrueMBB->removeSuccessor(LandMBB, true);
   1412     retireBlock(TrueMBB);
   1413     MLI->removeBlock(TrueMBB);
   1414   }
   1415 
   1416   if (FalseMBB) {
   1417     insertInstrBefore(I, R600::ELSE);
   1418     MBB->splice(I, FalseMBB, FalseMBB->begin(),
   1419                    FalseMBB->end());
   1420     MBB->removeSuccessor(FalseMBB, true);
   1421     if (LandMBB && FalseMBB->succ_size() != 0)
   1422       FalseMBB->removeSuccessor(LandMBB, true);
   1423     retireBlock(FalseMBB);
   1424     MLI->removeBlock(FalseMBB);
   1425   }
   1426   insertInstrBefore(I, R600::ENDIF);
   1427 
   1428   BranchMI->eraseFromParent();
   1429 
   1430   if (LandMBB && TrueMBB && FalseMBB)
   1431     MBB->addSuccessor(LandMBB);
   1432 }
   1433 
   1434 void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
   1435     MachineBasicBlock *LandMBB) {
   1436   LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
   1437                     << " land = BB" << LandMBB->getNumber() << "\n";);
   1438 
   1439   insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc());
   1440   insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc());
   1441   DstBlk->replaceSuccessor(DstBlk, LandMBB);
   1442 }
   1443 
   1444 void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
   1445     MachineBasicBlock *LandMBB) {
   1446   LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB"
   1447                     << ExitingMBB->getNumber() << " land = BB"
   1448                     << LandMBB->getNumber() << "\n";);
   1449   MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
   1450   assert(BranchMI && isCondBranch(BranchMI));
   1451   DebugLoc DL = BranchMI->getDebugLoc();
   1452   MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
   1453   MachineBasicBlock::iterator I = BranchMI;
   1454   if (TrueBranch != LandMBB)
   1455     reversePredicateSetter(I, *I->getParent());
   1456   insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL);
   1457   insertInstrBefore(I, R600::BREAK);
   1458   insertInstrBefore(I, R600::ENDIF);
   1459   //now branchInst can be erase safely
   1460   BranchMI->eraseFromParent();
   1461   //now take care of successors, retire blocks
   1462   ExitingMBB->removeSuccessor(LandMBB, true);
   1463 }
   1464 
   1465 void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
   1466     MachineBasicBlock *ContMBB) {
   1467   LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
   1468                     << ContingMBB->getNumber() << ", cont = BB"
   1469                     << ContMBB->getNumber() << "\n";);
   1470 
   1471   MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
   1472   if (MI) {
   1473     assert(isCondBranch(MI));
   1474     MachineBasicBlock::iterator I = MI;
   1475     MachineBasicBlock *TrueBranch = getTrueBranch(MI);
   1476     int OldOpcode = MI->getOpcode();
   1477     DebugLoc DL = MI->getDebugLoc();
   1478 
   1479     bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
   1480 
   1481     if (!UseContinueLogical) {
   1482       int BranchOpcode =
   1483           TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
   1484           getBranchZeroOpcode(OldOpcode);
   1485       insertCondBranchBefore(I, BranchOpcode, DL);
   1486       // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
   1487       insertInstrEnd(ContingMBB, R600::CONTINUE, DL);
   1488       insertInstrEnd(ContingMBB, R600::ENDIF, DL);
   1489     } else {
   1490       int BranchOpcode =
   1491           TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
   1492           getContinueZeroOpcode(OldOpcode);
   1493       insertCondBranchBefore(I, BranchOpcode, DL);
   1494     }
   1495 
   1496     MI->eraseFromParent();
   1497   } else {
   1498     // if we've arrived here then we've already erased the branch instruction
   1499     // travel back up the basic block to see the last reference of our debug
   1500     // location we've just inserted that reference here so it should be
   1501     // representative insertEnd to ensure phi-moves, if exist, go before the
   1502     // continue-instr.
   1503     insertInstrEnd(ContingMBB, R600::CONTINUE,
   1504         getLastDebugLocInBB(ContingMBB));
   1505   }
   1506 }
   1507 
   1508 int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
   1509     MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
   1510   int Cloned = 0;
   1511   assert(PreMBB->isSuccessor(SrcMBB));
   1512   while (SrcMBB && SrcMBB != DstMBB) {
   1513     assert(SrcMBB->succ_size() == 1);
   1514     if (SrcMBB->pred_size() > 1) {
   1515       SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
   1516       ++Cloned;
   1517     }
   1518 
   1519     PreMBB = SrcMBB;
   1520     SrcMBB = *SrcMBB->succ_begin();
   1521   }
   1522 
   1523   return Cloned;
   1524 }
   1525 
   1526 MachineBasicBlock *
   1527 AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
   1528     MachineBasicBlock *PredMBB) {
   1529   assert(PredMBB->isSuccessor(MBB) &&
   1530          "succBlk is not a prececessor of curBlk");
   1531 
   1532   MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
   1533   replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
   1534   //srcBlk, oldBlk, newBlk
   1535 
   1536   PredMBB->replaceSuccessor(MBB, CloneMBB);
   1537 
   1538   // add all successor to cloneBlk
   1539   cloneSuccessorList(CloneMBB, MBB);
   1540 
   1541   numClonedInstr += MBB->size();
   1542 
   1543   LLVM_DEBUG(dbgs() << "Cloned block: "
   1544                     << "BB" << MBB->getNumber() << "size " << MBB->size()
   1545                     << "\n";);
   1546 
   1547   SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
   1548 
   1549   return CloneMBB;
   1550 }
   1551 
   1552 void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
   1553     MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
   1554   MachineBasicBlock::iterator SpliceEnd;
   1555   //look for the input branchinstr, not the AMDGPU branchinstr
   1556   MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
   1557   if (!BranchMI) {
   1558     LLVM_DEBUG(dbgs() << "migrateInstruction don't see branch instr\n";);
   1559     SpliceEnd = SrcMBB->end();
   1560   } else {
   1561     LLVM_DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
   1562     SpliceEnd = BranchMI;
   1563   }
   1564   LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = "
   1565                     << DstMBB->size() << "srcSize = " << SrcMBB->size()
   1566                     << "\n";);
   1567 
   1568   //splice insert before insertPos
   1569   DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
   1570 
   1571   LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = "
   1572                     << DstMBB->size() << "srcSize = " << SrcMBB->size()
   1573                     << '\n';);
   1574 }
   1575 
   1576 MachineBasicBlock *
   1577 AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
   1578   MachineBasicBlock *LoopHeader = LoopRep->getHeader();
   1579   MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
   1580 
   1581   if (!LoopHeader || !LoopLatch)
   1582     return nullptr;
   1583   MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
   1584   // Is LoopRep an infinite loop ?
   1585   if (!BranchMI || !isUncondBranch(BranchMI))
   1586     return nullptr;
   1587 
   1588   MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
   1589   FuncRep->push_back(DummyExitBlk);  //insert to function
   1590   SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
   1591   LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
   1592   LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext();
   1593   Ctx.emitError("Extra register needed to handle CFG");
   1594   return nullptr;
   1595 }
   1596 
   1597 void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
   1598   MachineInstr *BranchMI;
   1599 
   1600   // I saw two unconditional branch in one basic block in example
   1601   // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
   1602   while ((BranchMI = getLoopendBlockBranchInstr(MBB))
   1603           && isUncondBranch(BranchMI)) {
   1604     LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
   1605     BranchMI->eraseFromParent();
   1606   }
   1607 }
   1608 
   1609 void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
   1610     MachineBasicBlock *MBB) {
   1611   if (MBB->succ_size() != 2)
   1612     return;
   1613   MachineBasicBlock *MBB1 = *MBB->succ_begin();
   1614   MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
   1615   if (MBB1 != MBB2)
   1616     return;
   1617 
   1618   MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
   1619   assert(BranchMI && isCondBranch(BranchMI));
   1620   LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
   1621   BranchMI->eraseFromParent();
   1622   SHOWNEWBLK(MBB1, "Removing redundant successor");
   1623   MBB->removeSuccessor(MBB1, true);
   1624 }
   1625 
   1626 void AMDGPUCFGStructurizer::addDummyExitBlock(
   1627     SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
   1628   MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
   1629   FuncRep->push_back(DummyExitBlk);  //insert to function
   1630   insertInstrEnd(DummyExitBlk, R600::RETURN);
   1631 
   1632   for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
   1633        E = RetMBB.end(); It != E; ++It) {
   1634     MachineBasicBlock *MBB = *It;
   1635     MachineInstr *MI = getReturnInstr(MBB);
   1636     if (MI)
   1637       MI->eraseFromParent();
   1638     MBB->addSuccessor(DummyExitBlk);
   1639     LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
   1640                       << " successors\n";);
   1641   }
   1642   SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
   1643 }
   1644 
   1645 void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
   1646   while (MBB->succ_size())
   1647     MBB->removeSuccessor(*MBB->succ_begin());
   1648 }
   1649 
   1650 void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
   1651     int SccNum) {
   1652   BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
   1653   if (!srcBlkInfo)
   1654     srcBlkInfo = new BlockInformation();
   1655   srcBlkInfo->SccNum = SccNum;
   1656 }
   1657 
   1658 void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
   1659   LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";);
   1660 
   1661   BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
   1662 
   1663   if (!SrcBlkInfo)
   1664     SrcBlkInfo = new BlockInformation();
   1665 
   1666   SrcBlkInfo->IsRetired = true;
   1667   assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
   1668          && "can't retire block yet");
   1669 }
   1670 
   1671 INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
   1672                       "AMDGPU CFG Structurizer", false, false)
   1673 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
   1674 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
   1675 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
   1676 INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
   1677                       "AMDGPU CFG Structurizer", false, false)
   1678 
   1679 FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
   1680   return new AMDGPUCFGStructurizer();
   1681 }
   1682