//===-- SILowerControlFlow.cpp - Use predicates for control flow ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC  // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
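///
/// Loops and breaks are lowered with the same exec-mask bookkeeping. As a
/// rough sketch (register names and labels here are illustrative only):
///
/// %SGPR0 = SI_BREAK %SGPR0    // becomes: %SGPR0 = S_OR_B64 %EXEC, %SGPR0
/// SI_LOOP %SGPR0, label_loop  // becomes: %EXEC = S_ANDN2_B64 %EXEC, %SGPR0
///                             //          S_CBRANCH_EXECNZ label_loop
/// SI_END_CF %SGPR0            // becomes: %EXEC = S_OR_B64 %EXEC, %SGPR0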
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI, bool ExecModified);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  std::pair<MachineBasicBlock *, MachineBasicBlock *>
  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                               const MachineRegisterInfo &MRI,
                               const MachineInstr &MI,
                               MachineBasicBlock &LoopBB,
                               MachineBasicBlock &RemainderBB,
                               unsigned SaveReg,
                               const MachineOperand &IdxReg);

  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                              MachineInstr *MovRel,
                              const MachineOperand &IdxReg,
                              int Offset);

  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
                                                       int Offset) const;
  bool indirectSrc(MachineInstr &MI);
  bool indirectDst(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }
};

} // End anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;

FunctionPass *llvm::createSILowerControlFlowPass() {
  return new SILowerControlFlow();
}

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
                                    MachineBasicBlock *To) {
  unsigned NumInstr = 0;
  MachineFunction *MF = From->getParent();

  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // Inline asm length estimate is number of bytes assuming the longest
        // instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
  SkipBB->addSuccessor(&NextBB);

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09)  // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef);

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

void SILowerControlFlow::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  if (ExecModified) {
    // Adjust the saved exec to account for the modifications during the flow
    // block that contains the ELSE. This can happen when WQM mode is switched
    // off.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
      .addReg(AMDGPU::EXEC)
      .addReg(Dst);
  }

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  // Insert a pseudo terminator to help keep the verifier happy.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addOperand(MI.getOperand(2))
    .addReg(Dst);

  MI.eraseFromParent();
}

void SILowerControlFlow::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlow::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

void SILowerControlFlow::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlow::Branch(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
  if (MBB == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

void SILowerControlFlow::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
  // Kill is only allowed in pixel / geometry shaders.
  assert(CallConv == CallingConv::AMDGPU_PS ||
         CallConv == CallingConv::AMDGPU_GS);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}

// All currently live registers must remain so in the remainder block.
void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
                                                 const MachineRegisterInfo &MRI,
                                                 const MachineInstr &MI,
                                                 MachineBasicBlock &LoopBB,
                                                 MachineBasicBlock &RemainderBB,
                                                 unsigned SaveReg,
                                                 const MachineOperand &IdxReg) {
  // Add reg defined in loop body.
  RemainderLiveRegs.addReg(SaveReg);

  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    if (!Val->isUndef()) {
      RemainderLiveRegs.addReg(Val->getReg());
      LoopBB.addLiveIn(Val->getReg());
    }
  }

  for (unsigned Reg : RemainderLiveRegs) {
    if (MRI.isAllocatable(Reg))
      RemainderBB.addLiveIn(Reg);
  }

  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  if (!Src->isUndef())
    LoopBB.addLiveIn(Src->getReg());

  if (!IdxReg.isUndef())
    LoopBB.addLiveIn(IdxReg.getReg());

  LoopBB.sortUniqueLiveIns();
}

void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                DebugLoc DL,
                                                MachineInstr *MovRel,
                                                const MachineOperand &IdxReg,
                                                int Offset) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  // Read the next variant into VCC (lower 32 bits) <- also loop target
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Move index from VCC into M0
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    .addReg(AMDGPU::VCC_LO);

  // Compare the just read M0 value to all possible Idx values
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    .addReg(AMDGPU::M0)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Update EXEC, save the original EXEC value to VCC
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    .addReg(AMDGPU::VCC);

  if (Offset != 0) {
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addReg(AMDGPU::M0)
      .addImm(Offset);
  }

  // Do the actual move
  LoopBB.insert(I, MovRel);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::VCC);

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);
}
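// For reference, the waterfall loop emitted above looks roughly like this
// (an illustrative sketch, not verbatim output):
//
// loop:
//   v_readfirstlane_b32 vcc_lo, vIdx   ; read one active lane's index
//   s_mov_b32 m0, vcc_lo
//   v_cmp_eq_u32 vcc, m0, vIdx         ; select all lanes with that same index
//   s_and_saveexec_b64 vcc, vcc
//   s_add_i32 m0, m0, Offset           ; only emitted when Offset != 0
//   <MovRel>                           ; the actual indirect move
//   s_xor_b64 exec, exec, vcc          ; turn off the lanes just handled
//   s_cbranch_execnz loop              ; repeat while any lane remains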
MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}

std::pair<MachineBasicBlock *, MachineBasicBlock *>
SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) {
  MachineFunction *MF = MBB.getParent();

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessors(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  return std::make_pair(LoopBB, RemainderBB);
}

// Returns true if a new block was inserted.
bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
    if (Offset != 0) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
        .addImm(Offset);
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    }

    MBB.insert(I, MovRel);
    MI.eraseFromParent();
    return false;
  }

  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  SaveOp->setIsDead(false);
  unsigned Save = SaveOp->getReg();

  // Reading from a VGPR requires looping over all workitems in the wavefront.
  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    .addReg(AMDGPU::EXEC);

  LivePhysRegs RemainderLiveRegs(TRI);
  RemainderLiveRegs.addLiveOuts(MBB);

  MachineBasicBlock *LoopBB;
  MachineBasicBlock *RemainderBB;

  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);

  for (const MachineInstr &Inst : reverse(*RemainderBB))
    RemainderLiveRegs.stepBackward(Inst);

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  LoopBB->addSuccessor(RemainderBB);
  LoopBB->addSuccessor(LoopBB);

  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
                          *RemainderBB, Save, *Idx);

  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(Save);

  MI.eraseFromParent();
  return true;
}
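// Illustration (not verbatim output): with a uniform SGPR index and Offset == 1,
// loadM0 simply emits "s_add_i32 m0, sIdx, 1" in front of the MovRel; with a
// divergent VGPR index it saves EXEC, splits the block, and emits the waterfall
// loop from emitLoadM0FromVGPRLoop instead.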
/// \param @VecReg The register which holds element zero of the vector being
///                addressed into.
///
/// \param[in] @Idx The index operand from the movrel instruction. This must be
///                 a register, but may be NoRegister.
///
/// \param[in] @Offset As an input, this is the constant offset part of the
///                    indirect Index, e.g. v0 = v[VecReg + Offset]. As an
///                    output, this is a constant value that needs to be
///                    added to the value stored in M0.
std::pair<unsigned, int>
SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int NumElts = SuperRC->getSize() / RC->getSize();

  int BaseRegIdx = TRI->getHWRegIndex(SubReg);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts)
    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);

  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  unsigned Reg = RC->getRegister(RegIdx);
  return std::make_pair(Reg, Offset);
}

// Return true if a new block was inserted.
bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
      .addReg(SrcVec->getReg(), RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}
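// Illustration (a sketch, not verbatim output): if the source vector's element
// zero lives in v4 and the constant offset is 2, computeIndirectRegAndOffset
// returns v6, so a missing index register lowers to "v_mov_b32_e32 dst, v6",
// while a register index lowers to "v_movrels_b32_e32 dst, v6" with the m0
// setup supplied by loadM0.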
// Return true if a new block was inserted.
bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  unsigned Reg;

  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);

  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  if (Idx->getReg() == AMDGPU::NoRegister) {
    // Only had a constant offset, copy the register directly.
    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
      .addOperand(*Val);
    MI.eraseFromParent();
    return false;
  }

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
      .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
      .addReg(Dst, RegState::Implicit);

  return loadM0(MI, MovRel, Offset);
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  MachineFunction::iterator NextBB;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
        ExecModified = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            HaveKill = false;

            if (skipIfDead(MI, *NextBB)) {
              NextBB = std::next(BI);
              BE = MF.end();
              Next = MBB.end();
            }
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL_TERMINATOR:
          if (Depth == 0) {
            if (skipIfDead(MI, *NextBB)) {
              NextBB = std::next(BI);
              BE = MF.end();
              Next = MBB.end();
            }
          } else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          if (indirectSrc(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          if (indirectDst(MI)) {
            // The block was split at this point. We can safely skip the middle
            // inserted block to the following which contains the rest of this
            // block's instructions.
            NextBB = std::next(BI);
            BE = MF.end();
            Next = MBB.end();
          }
          break;

        case AMDGPU::SI_RETURN: {
          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // SI_RETURN is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
            I->eraseFromParent();
          }
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    if (NeedFlat)
      MFI->setHasFlatInstructions(true);
  }

  return true;
}