//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

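// Models the occupancy of the hardware control-flow stack so the pass can
// compute the STACK_SIZE value a shader must reserve. Items come in two
// granularities: full entries and sub-entries; four sub-entries pack into
// one full entry (see updateMaxStackSize()).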
struct CFStack {
  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  return std::find(BranchStack.begin(), BranchStack.end(), Item) !=
         BranchStack.end();
}

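// Returns true when a work-around for a hardware control-flow stack bug is
// required for Opcode at the current stack occupancy; the thresholds below
// are deliberately conservative.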
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}

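// Returns how many sub-entries a given stack item consumes; the first
// non-WQM push needs extra space on pre-Evergreen parts (and, experimentally,
// on Evergreen as well).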
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

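// Recomputes the high-water mark: four sub-entries pack into one full entry,
// so the sub-entry count is rounded up to a multiple of four.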
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize = CurrentEntries +
                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

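// Records a branch push. WQM (whole quad mode) pushes consume a full entry,
// while non-WQM pushes normally consume only a sub-entry; the first non-WQM
// push may need extra space (see getSubEntrySize()).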
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI; see comment in
                                             // CFStack::getSubEntrySize().
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  // A clause is the CF instruction that opens it paired with the
  // instructions that make up its body.
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget *ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

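  // Maps a generic ControlFlowInstruction to the instruction description of
  // its native R600/Evergreen/Cayman encoding.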
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

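  // A fetch instruction can join the current clause only if none of its
  // sources was written earlier in the same clause. DstRegs accumulates the
  // 128-bit registers defined so far.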
  bool isCompatibleWithClause(const MachineInstr *MI,
      std::set<unsigned> &DstRegs) const {
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
        E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if (DstRegs.find(SrcMI) == DstRegs.end()) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

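  // Greedily groups consecutive texture (or vertex) fetch instructions
  // starting at I into a single clause, capped at MaxFetchInst, and emits
  // the CF_TC/CF_VC instruction that heads it.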
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }

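  // Rewrites the ALU_LITERAL_X placeholder sources of MI to the X/Y/Z/W
  // literal slot holding their immediate, de-duplicating identical values;
  // an instruction group can carry at most four literals.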
  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        // Reuse the literal slot that already holds this immediate.
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

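  // Emits LITERALS pseudo instructions carrying the literal values in pairs.
  // Note: this helper currently has no callers in this pass; MakeALUClause
  // builds its LITERALS instructions directly.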
  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

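  // Collects the ALU instructions following the clause head at I into one
  // clause: bundles are flattened, literal placeholders are assigned to
  // slots, and LITERALS instructions are emitted after each group. The
  // clause head's instruction count operand (operand 7) is updated at the
  // end.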
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        // Flatten the bundle: unbundle each instruction, clear internal-read
        // flags, and delete the BUNDLE header afterwards.
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(&*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      // Literals are encoded two per LITERALS instruction.
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned Literal0 = Literals[i];
        unsigned Literal1 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS))
            .addImm(Literal0)
            .addImm(Literal1);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, std::move(ClauseContent));
  }

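  // Splices a previously built fetch clause to its final position, patches
  // the clause head with the now-known address, and advances CfCount; each
  // fetch instruction occupies two CF instruction slots.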
  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

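  // Same as EmitFetchClause, but for ALU clauses, where each instruction
  // occupies a single CF instruction slot.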
  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

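  // Back-patches the address operand of the given instruction(s) once the
  // final CF address is known.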
  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer(TargetMachine &tm)
      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}

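  // Walks every basic block, grouping fetch/ALU instructions into clauses,
  // lowering control-flow pseudos to native CF instructions with resolved
  // addresses, and recording the maximum stack depth in StackSize.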
  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<AMDGPUSubtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo());
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MFI->getShaderType());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
        ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MFI->getShaderType() == ShaderType::VERTEX) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // Intentional fall-through: the clause is then emitted like a
          // regular CF_ALU clause.
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          // Remember the loop header address and the instructions whose
          // target address must be back-patched when ENDLOOP is reached.
          std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            // The region ends in an ALU clause: fold the pop into it later by
            // rewriting that clause to CF_ALU_POP_AFTER (see below).
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          // Capture the debug location before erasing MI; the original code
          // queried MI again after eraseFromParent().
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          MI->eraseFromParent();
          if (CfCount % 2) {
            // Pad the CF program to an even number of slots.
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
      // Rewrite CF_ALU clauses that should pop the stack when they finish
      // into CF_ALU_POP_AFTER, copying all nine immediate operands over.
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}