//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass compiles the control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <set>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget &ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}

bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST.hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST.getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.
      // Applying this work-around when CurrentSubEntries > 3 allows us to
      // over-allocate stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST.getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}
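// Explanatory note (inference from the comments above, not from the original
// source): the modulo conditions quoted there suggest that one physical stack
// entry packs four sub-entries on a 64-wide wavefront and eight on a 32-wide
// one.  The conservative thresholds therefore fire as soon as the first
// packed entry could be full, e.g. CurrentSubEntries == 4 with a 64-wide
// wavefront already satisfies CurrentSubEntries > 3.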
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST.hasCaymanISA());
    if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize = CurrentEntries +
                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}
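// Worked example (illustrative, not from the original source): with
// CurrentEntries == 2 and CurrentSubEntries == 5,
// RoundUpToAlignment(5, 4) / 4 == 2, so the sub-entries consume two packed
// entries and CurrentStackSize == 4.  MaxStackSize only ever grows; it is
// the value that ultimately lands in MFI->StackSize at the end of the pass.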
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST.hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI; see comment in
                                             // CFStack::getSubEntrySize()
      else if (CurrentEntries > 0 &&
               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST.hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget &ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST.hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

  bool isCompatibleWithClause(const MachineInstr *MI,
                              std::set<unsigned> &DstRegs) const {
    // Initialized to avoid reading indeterminate values when MI has no def
    // or no use operand.
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
         E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }
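  // Explanatory note (not from the original source): the check above tracks
  // clause hazards at 128-bit register granularity.  Every def and use is
  // widened to its R600_Reg128 super-register, so e.g. a fetch writing
  // T0_XYZW followed in the same clause by a fetch reading any channel of T0
  // maps both to the same super-register, fails the DstRegs lookup, and
  // terminates the clause — presumably because results within one fetch
  // clause are not ordered with respect to each other.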
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, ClauseContent);
  }

  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        // Reuse the literal slot that already holds this immediate.
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(BI, Literals);
          ClauseContent.push_back(BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned literal0 = Literals[i];
        unsigned literal2 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS))
            .addImm(literal0)
            .addImm(literal2);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, ClauseContent);
  }
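  // Worked example (illustrative, not from the original source): an ALU
  // group whose sources are the immediates 1.0f, 2.0f and 1.0f again gets
  // the literal slots X, Y and X from getLiteral(), and Literals then holds
  // {0x3f800000, 0x40000000}.  The loop above emits one LITERALS pseudo
  // carrying both 32-bit values; an odd literal count is padded with 0 so
  // the literals always fill a whole 64-bit word.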
  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
                  unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
                unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }
  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (std::set<MachineInstr *>::const_iterator It = MIs.begin(),
         E = MIs.end(); It != E; ++It) {
      MachineInstr *MI = *It;
      CounterPropagateAddr(MI, Addr);
    }
  }
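  // Explanatory note (not from the original source): forward branch targets
  // cannot be known when a CF_JUMP or CF_ELSE is emitted, so those
  // instructions are created with a placeholder address of 0 and recorded on
  // IfThenElseStack (or in a loop's break/continue set).  Once the matching
  // ENDIF/ENDLOOP is reached and the final CfCount is known,
  // CounterPropagateAddr() adds it to the placeholder, back-patching the
  // real target address.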
public:
  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
      TII(nullptr), TRI(nullptr),
      ST(tm.getSubtarget<AMDGPUSubtarget>()) {
    MaxFetchInst = ST.getTexVTXClauseSize();
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(
        MF.getTarget().getRegisterInfo());
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MFI->ShaderType);
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MFI->ShaderType == ShaderType::VERTEX) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
                getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                    TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // Intentional fall-through: the (possibly rewritten) instruction
          // is handled as a regular CF_ALU below.
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(Pair);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              LoopStack.back();
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                        getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
          CfCount++;
          MI->eraseFromParent();
          if (CfCount % 2) {
            BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
            CfCount++;
          }
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
                TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }
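  // Explanatory note (not from the original source): the layout produced
  // above depends on CF instruction sizes.  CfCount appears to count 64-bit
  // CF words: a CF or ALU instruction occupies one word while a TEX/VTX
  // fetch occupies two, which is why EmitFetchClause() advances CfCount by
  // 2 * Clause.second.size() but EmitALUClause() by the instruction count.
  // The PAD emitted when CfCount is odd keeps the clause bodies spliced in
  // after CF_END aligned on a 128-bit boundary.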
  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace


llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}