//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their addresses on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <set>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

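// Tracks hardware control-flow stack usage at the current program point.
// The stack holds full entries (loops and WQM pushes) and sub-entries
// (non-WQM pushes); four sub-entries pack into one full entry, and
// MaxStackSize records the high-water mark used for the STACK_SIZE info.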
struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget &ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  return std::find(BranchStack.begin(), BranchStack.end(), Item) !=
         BranchStack.end();
}

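// Returns true when \p Opcode needs the CF_ALU hardware bug work-around on
// this subtarget, based on the current loop depth and sub-entry count.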
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST.hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST.getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST.getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}

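// Number of stack sub-entries consumed by \p Item. Sub-entries are later
// packed four to a full stack entry in updateMaxStackSize().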
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST.hasCaymanISA());
    if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

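// Four sub-entries fold into one full entry, so round the sub-entry count up
// to a multiple of four. For example, CurrentEntries = 2 with
// CurrentSubEntries = 5 yields 2 + (8 / 4) = 4 occupied entries.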
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize = CurrentEntries +
                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST.hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI; see comment in
                                             // CFStack::getSubEntrySize().
      else if (CurrentEntries > 0 &&
               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST.hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

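// The pass itself: lowers control-flow pseudo instructions to native,
// generation-specific CF instructions, groups fetch and ALU instructions
// into clauses, resolves clause and jump addresses, and records the final
// stack size.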
class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget &ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

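  // Map a target-independent CF opcode to the native opcode for the current
  // hardware generation (R600/R700 vs. Evergreen and later; Cayman has its
  // own CF_END).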
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST.hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

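  // A fetch instruction can join the current clause only if it does not read
  // a 128-bit register that an earlier instruction in the clause writes;
  // DstRegs accumulates the clause's destination super-registers.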
  bool isCompatibleWithClause(const MachineInstr *MI,
      std::set<unsigned> &DstRegs) const {
    // Initialize to 0 so instructions without register operands compare
    // cleanly below.
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
        E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if (DstRegs.find(SrcMI) == DstRegs.end()) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

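  // Collect a run of consecutive fetch instructions, all texture or all
  // vertex, into one clause headed by a CF_TC/CF_VC instruction, stopping at
  // MaxFetchInst or at the first incompatible instruction.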
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, ClauseContent);
  }

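  // Rewrite each ALU_LITERAL_X source of \p MI to the literal channel that
  // holds its immediate, reusing channels for duplicate values; an
  // instruction group carries at most four distinct literals.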
  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

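  // Materialize collected literals as LITERALS pseudo instructions, two
  // 32-bit immediates per instruction.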
  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

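  // Gather the ALU instructions following a clause header into a clause:
  // unbundle instruction groups, materialize their literals, and store the
  // clause size in the header (operand 7).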
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(BI, Literals);
          ClauseContent.push_back(BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned Literal0 = Literals[i];
        unsigned Literal1 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS))
            .addImm(Literal0)
            .addImm(Literal1);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, ClauseContent);
  }

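  // Splice a previously built fetch clause to its final position behind a
  // FETCH_CLAUSE marker; each fetch instruction counts as two CF words, hence
  // the doubled CfCount increment.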
  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

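  // Likewise for ALU clauses: reset the header's ADDR operand, splice the
  // clause behind an ALU_CLAUSE marker, and count one CF word per
  // instruction.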
  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

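  // CF instructions are first emitted with a placeholder offset in their
  // ADDR operand; once the target address is known, add it to that offset.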
  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }
  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
      unsigned Addr) const {
    for (std::set<MachineInstr *>::const_iterator It = MIs.begin(),
        E = MIs.end(); It != E; ++It) {
      MachineInstr *MI = *It;
      CounterPropagateAddr(MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
    TII(nullptr), TRI(nullptr),
    ST(tm.getSubtarget<AMDGPUSubtarget>()) {
      MaxFetchInst = ST.getTexVTXClauseSize();
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MFI->ShaderType);
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
        ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MFI->ShaderType == ShaderType::VERTEX) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;
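      // LastAlu tracks the most recent CF_ALU at each if-nesting depth; when
      // an ENDIF sees one, its POP is folded into that CF_ALU (rewritten to
      // CF_ALU_POP_AFTER via ToPopAfter) instead of emitting a separate POP.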

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);

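          // Intentional fallthrough: the (possibly rewritten) instruction now
          // heads an ALU clause.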
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(Pair);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              LoopStack.back();
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          // Grab the debug location before MI is erased.
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          MI->eraseFromParent();
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
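      // Fold the POP of each ENDIF that immediately followed a CF_ALU into
      // the clause header by rewriting that CF_ALU as CF_ALU_POP_AFTER.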
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}