//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "r600cf"
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <set>
#include <vector>

using namespace llvm;

namespace {

class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget &ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST.hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }
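  /// \brief Return true if \p MI can be appended to the fetch clause being
  /// formed without breaking it.
  ///
  /// Fetch instructions in a clause all issue before their results are
  /// written back, so an instruction must not read a 128-bit register that
  /// an earlier instruction of the same clause defines. Operands are widened
  /// to their containing 128-bit super-register before the check; on success
  /// the destination is recorded in \p DstRegs.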
  bool isCompatibleWithClause(const MachineInstr *MI,
                              std::set<unsigned> &DstRegs) const {
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
                                          E = MI->operands_end();
         I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
                                getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, ClauseContent);
  }

  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        // Reuse the slot of an identical literal already in the group.
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
                          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }
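  /// \brief Gather the ALU instructions following the CF_ALU at \p I into a
  /// clause, materializing their literal constants.
  ///
  /// Bundled instruction groups are flattened into individual instructions,
  /// and the 32-bit literals each group consumes are emitted in pairs as
  /// AMDGPU::LITERALS pseudos immediately after it. The clause head's COUNT
  /// operand (operand 7) is patched to the number of instructions minus one.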
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        // Flatten the bundle: unbundle each instruction, clear its
        // internal-read flags, and collect the literals it references.
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(BI, Literals);
          ClauseContent.push_back(BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned Literal0 = Literals[i];
        unsigned Literal1 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
                                      TII->get(AMDGPU::LITERALS))
            .addImm(Literal0)
            .addImm(Literal1);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, ClauseContent);
  }

  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
                  unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    // Each fetch instruction occupies two CF words.
    CfCount += 2 * Clause.second.size();
  }

  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
                unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }
  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (std::set<MachineInstr *>::const_iterator It = MIs.begin(),
         E = MIs.end(); It != E; ++It) {
      MachineInstr *MI = *It;
      CounterPropagateAddr(MI, Addr);
    }
  }
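  /// \brief Convert a count of stack sub-entries into the value the hardware
  /// expects in the STACK_SIZE field.
  ///
  /// The stack is allocated in chunks of 4 sub-entries, and some generations
  /// reserve extra sub-entries when the shader uses PUSH. As a worked
  /// example: 5 sub-entries with a push on EVERGREEN becomes
  /// 5 + 1 (push) + 2 (fallthrough), reported as (8 + 3) / 4 = 2 entries.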
  unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
    switch (ST.getGeneration()) {
    case AMDGPUSubtarget::R600:
    case AMDGPUSubtarget::R700:
      if (hasPush)
        StackSubEntry += 2;
      break;
    case AMDGPUSubtarget::EVERGREEN:
      if (hasPush)
        StackSubEntry++;
      // FALLTHROUGH
    case AMDGPUSubtarget::NORTHERN_ISLANDS:
      StackSubEntry += 2;
      break;
    default: llvm_unreachable("Not a VLIW4/VLIW5 GPU");
    }
    return (StackSubEntry + 3) / 4; // Need ceil value of StackSubEntry/4
  }

public:
  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
      TII(0), TRI(0), ST(tm.getSubtarget<AMDGPUSubtarget>()) {
    MaxFetchInst = ST.getTexVTXClauseSize();
  }

  virtual bool runOnMachineFunction(MachineFunction &MF) {
    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(
        MF.getTarget().getRegisterInfo());

    unsigned MaxStack = 0;
    unsigned CurrentStack = 0;
    bool HasPush = false;
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      if (MFI->ShaderType == 1) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
                getHWInstrDesc(CF_CALL_FS));
        CfCount++;
        MaxStack = 1;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = 0;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          CurrentStack++;
          MaxStack = std::max(MaxStack, CurrentStack);
          HasPush = true;
          // FALLTHROUGH
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          // FALLTHROUGH
        case AMDGPU::EG_ExportBuf:
        case AMDGPU::EG_ExportSwz:
        case AMDGPU::R600_ExportBuf:
        case AMDGPU::R600_ExportSwz:
        case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
        case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
        case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
        case AMDGPU::RAT_STORE_DWORD32_cm:
        case AMDGPU::RAT_STORE_DWORD64_cm:
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CurrentStack += 4;
          MaxStack = std::max(MaxStack, CurrentStack);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(Pair);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CurrentStack -= 4;
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              LoopStack.back();
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(0);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CurrentStack--;
          if (LastAlu.back()) {
            // The preceding CF_ALU can pop the stack itself; defer the
            // rewrite to CF_ALU_POP_AFTER (see the loop after this one).
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                        getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          // Capture the debug location before erasing MI; the iterator must
          // not be dereferenced again afterwards.
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          MI->eraseFromParent();
          if (CfCount % 2) {
            // CF instructions are fetched in pairs; pad to an even count.
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          break;
        }
      }
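      // Rewrite each CF_ALU that immediately precedes an ENDIF as
      // CF_ALU_POP_AFTER with identical operands: the stack pop is folded
      // into the ALU clause itself, saving a separate CF_POP entry.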
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
                TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = getHWStackSize(MaxStack, HasPush);
    }

    return false;
  }

  const char *getPassName() const {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace


llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}
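// Note: how this factory is wired into the backend lives outside this file;
// a typical (hypothetical) call site in the target's pass configuration
// would look like:
//
//   addPass(createR600ControlFlowFinalizer(*TM));
//
// The pass is expected to run late in codegen, after ALU instructions have
// been scheduled and bundled into groups, since MakeALUClause flattens those
// bundles while forming clauses.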