//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

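// Models the hardware control-flow stack: full entries for loops and WQM
// pushes, fractional sub-entries for non-WQM pushes. Tracking both lets the
// pass compute the STACK_SIZE value a wavefront needs.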
struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const R600Subtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (StackItem I : BranchStack)
    if (I == Item)
      return true;
  return false;
}

bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to
      // over-allocate stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}

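// Returns how many sub-entries the given stack item consumes. Non-WQM pushes
// need extra space on some generations; see the comments below.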
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= R600Subtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

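// Four sub-entries pack into one full hardware stack entry, so the current
// size is CurrentEntries plus CurrentSubEntries rounded up to a multiple of 4.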
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
                                             // See comment in
                                             // CFStack::getSubEntrySize()
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > R600Subtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

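// Lowers the control-flow pseudo instructions of a machine function into the
// generation-specific hardware CF instructions, groups fetch/ALU instructions
// into clauses, and records the final stack size in R600MachineFunctionInfo.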
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const R600Subtarget *ST;

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

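  // Maps a target-independent CF pseudo opcode to the MCInstrDesc of the
  // matching hardware instruction for the current GPU generation.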
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

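  // A fetch instruction may join the clause only if no earlier instruction in
  // the clause writes the 128-bit register group it reads from.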
  bool isCompatibleWithClause(const MachineInstr &MI,
                              std::set<unsigned> &DstRegs) const {
    // Initialize to NoRegister so an instruction without register operands
    // does not read uninitialized values below.
    unsigned DstMI = 0, SrcMI = 0;
    for (const MachineOperand &MO : MI.operands()) {
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if (DstRegs.find(SrcMI) == DstRegs.end()) {
      DstRegs.insert(DstMI);
      return true;
    }
    return false;
  }

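  // Greedily collects consecutive texture or vertex fetch instructions into a
  // single fetch clause, capped at MaxFetchInst, and emits the CF_TC/CF_VC
  // instruction that launches it.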
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(*ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(*I)) ||
          (!IsTex && !TII->usesVertexCache(*I)))
        break;
      if (!isCompatibleWithClause(*I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(&*I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }

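  // Assigns each immediate operand of MI to one of the four per-group literal
  // registers (X/Y/Z/W), reusing a slot when the same value already occupies
  // one; an instruction group can hold at most four distinct literals.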
  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (const auto &Src : Srcs) {
      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Src.second;
      std::vector<MachineOperand *>::iterator It =
          std::find_if(Lits.begin(), Lits.end(),
                       [&](MachineOperand *val)
                           { return val->isImm() && (val->getImm() == Imm); });

      // Get corresponding Operand
      MachineOperand &Operand = MI.getOperand(
          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));

      if (It != Lits.end()) {
        // Reuse existing literal reg
        unsigned Index = It - Lits.begin();
        Src.first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate new literal reg
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Src.first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(&Operand);
      }
    }
  }

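  // Emits LITERALS instructions at InsertPos, packing two 32-bit literal
  // values into each one (the second slot is zero-padded for an odd count).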
  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

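  // Collects the ALU instructions following ClauseHead into one ALU clause,
  // unbundling instruction groups, materializing their literals as LITERALS
  // instructions, and writing the final instruction count into ClauseHead.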
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineInstr &ClauseHead = *I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(*I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<MachineOperand *> Literals;
      if (I->isBundle()) {
        MachineInstr &DeleteMI = *I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (MachineOperand &MO : BI->operands()) {
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI.eraseFromParent();
      } else {
        getLiteral(*I, Literals);
        ClauseContent.push_back(&*I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS));
        if (Literals[i]->isImm()) {
          MILit.addImm(Literals[i]->getImm());
        } else {
          MILit.addGlobalAddress(Literals[i]->getGlobal(),
                                 Literals[i]->getOffset());
        }
        if (i + 1 < e) {
          if (Literals[i + 1]->isImm()) {
            MILit.addImm(Literals[i + 1]->getImm());
          } else {
            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
                                   Literals[i + 1]->getOffset());
          }
        } else
          MILit.addImm(0);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(&ClauseHead, std::move(ClauseContent));
  }

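  // Splices a previously built clause back into the block at InsertPos behind
  // a FETCH_CLAUSE/ALU_CLAUSE marker and advances the CF instruction counter
  // (fetch instructions occupy two CF slots each, ALU instructions one).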
  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

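  // Rewrites the address operand of a CF instruction (or a set of them) once
  // the final address is known, adding Addr to the placeholder offset.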
  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(*MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer(TargetMachine &tm)
      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();

    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
        ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // The (possibly rewritten) instruction is handled as a regular
          // CF_ALU clause head below.
          LLVM_FALLTHROUGH;
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
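      // Replace each CF_ALU that was the last ALU clause before an ENDIF with
      // CF_ALU_POP_AFTER, folding the stack pop into the clause itself.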
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}