      1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief This pass lowers the pseudo control flow instructions to real
     12 /// machine instructions.
     13 ///
     14 /// All control flow is handled using predicated instructions and
     15 /// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
     16 /// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
     17 /// by writing to the 64-bit EXEC register (each bit corresponds to a
     18 /// single vector ALU).  Typically, for predicates, a vector ALU will write
     19 /// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
     20 /// Vector ALU) and then the Scalar ALU will AND the VCC register with
     21 /// EXEC to update the predicates.
     22 ///
     23 /// For example:
     24 /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
     25 /// %SGPR0 = SI_IF %VCC
     26 ///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
     27 /// %SGPR0 = SI_ELSE %SGPR0
     28 ///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
     29 /// SI_END_CF %SGPR0
     30 ///
     31 /// becomes:
     32 ///
     33 /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
     34 /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
     35 /// S_CBRANCH_EXECZ label0            // This instruction is an optional
     36 ///                                   // optimization which allows us to
     37 ///                                   // branch if all the bits of
     38 ///                                   // EXEC are zero.
     39 /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
     40 ///
     41 /// label0:
     42 /// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Save the THEN exec mask and re-enable
     43 ///                                    // all bits of the if/else region
     44 /// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Leave only the ELSE threads enabled
     45 /// S_CBRANCH_EXECZ label1             // Use our branch optimization again
     46 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
     47 /// label1:
     48 /// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
     49 //===----------------------------------------------------------------------===//
     50 
     51 #include "AMDGPU.h"
     52 #include "AMDGPUSubtarget.h"
     53 #include "SIInstrInfo.h"
     54 #include "SIMachineFunctionInfo.h"
     55 #include "llvm/CodeGen/LivePhysRegs.h"
     56 #include "llvm/CodeGen/MachineFrameInfo.h"
     57 #include "llvm/CodeGen/MachineFunction.h"
     58 #include "llvm/CodeGen/MachineFunctionPass.h"
     59 #include "llvm/CodeGen/MachineInstrBuilder.h"
     60 #include "llvm/CodeGen/MachineRegisterInfo.h"
     61 #include "llvm/IR/Constants.h"
     62 
     63 using namespace llvm;
     64 
     65 #define DEBUG_TYPE "si-lower-control-flow"
     66 
     67 namespace {
     68 
     69 class SILowerControlFlow : public MachineFunctionPass {
     70 private:
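          // A skip branch (S_CBRANCH_EXECZ) is only emitted when the region it would
          // jump over is estimated to contain at least this many instructions;
          // shorter regions are simply left to execute with EXEC = 0.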
     71   static const unsigned SkipThreshold = 12;
     72 
     73   const SIRegisterInfo *TRI;
     74   const SIInstrInfo *TII;
     75 
     76   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
     77 
     78   void Skip(MachineInstr &From, MachineOperand &To);
     79   bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
     80 
     81   void If(MachineInstr &MI);
     82   void Else(MachineInstr &MI, bool ExecModified);
     83   void Break(MachineInstr &MI);
     84   void IfBreak(MachineInstr &MI);
     85   void ElseBreak(MachineInstr &MI);
     86   void Loop(MachineInstr &MI);
     87   void EndCf(MachineInstr &MI);
     88 
     89   void Kill(MachineInstr &MI);
     90   void Branch(MachineInstr &MI);
     91 
     92   MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
     93                                      MachineBasicBlock::iterator I) const;
     94 
     95   std::pair<MachineBasicBlock *, MachineBasicBlock *>
     96   splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
     97 
     98   void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
     99                                const MachineRegisterInfo &MRI,
    100                                const MachineInstr &MI,
    101                                MachineBasicBlock &LoopBB,
    102                                MachineBasicBlock &RemainderBB,
    103                                unsigned SaveReg,
    104                                const MachineOperand &IdxReg);
    105 
    106   void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
    107                               MachineInstr *MovRel,
    108                               const MachineOperand &IdxReg,
    109                               int Offset);
    110 
    111   bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
    112   std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
    113                                                        int Offset) const;
    114   bool indirectSrc(MachineInstr &MI);
    115   bool indirectDst(MachineInstr &MI);
    116 
    117 public:
    118   static char ID;
    119 
    120   SILowerControlFlow() :
    121     MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
    122 
    123   bool runOnMachineFunction(MachineFunction &MF) override;
    124 
    125   const char *getPassName() const override {
    126     return "SI Lower control flow pseudo instructions";
    127   }
    128 };
    129 
    130 } // End anonymous namespace
    131 
    132 char SILowerControlFlow::ID = 0;
    133 
    134 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
    135                 "SI lower control flow", false, false)
    136 
    137 char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
    138 
    139 
    140 FunctionPass *llvm::createSILowerControlFlowPass() {
    141   return new SILowerControlFlow();
    142 }
    143 
    144 static bool opcodeEmitsNoInsts(unsigned Opc) {
    145   switch (Opc) {
    146   case TargetOpcode::IMPLICIT_DEF:
    147   case TargetOpcode::KILL:
    148   case TargetOpcode::BUNDLE:
    149   case TargetOpcode::CFI_INSTRUCTION:
    150   case TargetOpcode::EH_LABEL:
    151   case TargetOpcode::GC_LABEL:
    152   case TargetOpcode::DBG_VALUE:
    153     return true;
    154   default:
    155     return false;
    156   }
    157 }
    158 
    159 bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
    160                                     MachineBasicBlock *To) {
    161 
    162   unsigned NumInstr = 0;
    163   MachineFunction *MF = From->getParent();
    164 
    165   for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
    166        MBBI != End && MBBI != ToI; ++MBBI) {
    167     MachineBasicBlock &MBB = *MBBI;
    168 
    169     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
    170          NumInstr < SkipThreshold && I != E; ++I) {
    171       if (opcodeEmitsNoInsts(I->getOpcode()))
    172         continue;
    173 
    174       // When a uniform loop is inside non-uniform control flow, the branch
    175       // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
     176       // when EXEC = 0. We should skip the loop lest it become infinite.
    177       if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
    178           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
    179         return true;
    180 
    181       if (I->isInlineAsm()) {
    182         const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
    183         const char *AsmStr = I->getOperand(0).getSymbolName();
    184 
     185         // The inline asm length estimate is a byte count that assumes every
     186         // instruction uses the longest possible encoding.
    187         uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
    188         NumInstr += MaxAsmSize / MAI->getMaxInstLength();
    189       } else {
    190         ++NumInstr;
    191       }
    192 
    193       if (NumInstr >= SkipThreshold)
    194         return true;
    195     }
    196   }
    197 
    198   return false;
    199 }
    200 
    201 void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
    202 
    203   if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    204     return;
    205 
    206   DebugLoc DL = From.getDebugLoc();
    207   BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    208     .addOperand(To);
    209 }
    210 
    211 bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
    212   MachineBasicBlock &MBB = *MI.getParent();
    213   MachineFunction *MF = MBB.getParent();
    214 
    215   if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
    216       !shouldSkip(&MBB, &MBB.getParent()->back()))
    217     return false;
    218 
    219   MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
    220   SkipBB->addSuccessor(&NextBB);
    221 
    222   const DebugLoc &DL = MI.getDebugLoc();
    223 
    224   // If the exec mask is non-zero, skip the next two instructions
    225   BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    226     .addMBB(&NextBB);
    227 
    228   MachineBasicBlock::iterator Insert = SkipBB->begin();
    229 
    230   // Exec mask is zero: Export to NULL target...
    231   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
    232     .addImm(0)
    233     .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    234     .addImm(0)
    235     .addImm(1)
    236     .addImm(1)
    237     .addReg(AMDGPU::VGPR0, RegState::Undef)
    238     .addReg(AMDGPU::VGPR0, RegState::Undef)
    239     .addReg(AMDGPU::VGPR0, RegState::Undef)
    240     .addReg(AMDGPU::VGPR0, RegState::Undef);
    241 
    242   // ... and terminate wavefront.
    243   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
    244 
    245   return true;
    246 }
    247 
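        // SI_IF: save the incoming exec mask, restrict EXEC to the threads for which
        // the condition (Vcc) holds, and leave the complementary set of threads (the
        // ones that must run the ELSE path) in Reg for SI_ELSE / SI_END_CF.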
    248 void SILowerControlFlow::If(MachineInstr &MI) {
    249   MachineBasicBlock &MBB = *MI.getParent();
    250   DebugLoc DL = MI.getDebugLoc();
    251   unsigned Reg = MI.getOperand(0).getReg();
    252   unsigned Vcc = MI.getOperand(1).getReg();
    253 
    254   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    255           .addReg(Vcc);
    256 
    257   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    258           .addReg(AMDGPU::EXEC)
    259           .addReg(Reg);
    260 
    261   Skip(MI, MI.getOperand(2));
    262 
    263   // Insert a pseudo terminator to help keep the verifier happy.
    264   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    265     .addOperand(MI.getOperand(2))
    266     .addReg(Reg);
    267 
    268   MI.eraseFromParent();
    269 }
    270 
    271 void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
    272   MachineBasicBlock &MBB = *MI.getParent();
    273   DebugLoc DL = MI.getDebugLoc();
    274   unsigned Dst = MI.getOperand(0).getReg();
    275   unsigned Src = MI.getOperand(1).getReg();
    276 
    277   BuildMI(MBB, MBB.getFirstNonPHI(), DL,
    278           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    279           .addReg(Src); // Saved EXEC
    280 
    281   if (ExecModified) {
    282     // Adjust the saved exec to account for the modifications during the flow
    283     // block that contains the ELSE. This can happen when WQM mode is switched
    284     // off.
    285     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
    286             .addReg(AMDGPU::EXEC)
    287             .addReg(Dst);
    288   }
    289 
    290   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    291           .addReg(AMDGPU::EXEC)
    292           .addReg(Dst);
    293 
    294   Skip(MI, MI.getOperand(2));
    295 
    296   // Insert a pseudo terminator to help keep the verifier happy.
    297   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    298     .addOperand(MI.getOperand(2))
    299     .addReg(Dst);
    300 
    301   MI.eraseFromParent();
    302 }
    303 
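        // SI_BREAK: accumulate the threads that are leaving the loop by ORing the
        // current EXEC mask into the running break mask (Dst = EXEC | Src).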
    304 void SILowerControlFlow::Break(MachineInstr &MI) {
    305   MachineBasicBlock &MBB = *MI.getParent();
    306   DebugLoc DL = MI.getDebugLoc();
    307 
    308   unsigned Dst = MI.getOperand(0).getReg();
    309   unsigned Src = MI.getOperand(1).getReg();
    310 
    311   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    312           .addReg(AMDGPU::EXEC)
    313           .addReg(Src);
    314 
    315   MI.eraseFromParent();
    316 }
    317 
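        // SI_IF_BREAK: OR the break condition (Vcc) into the accumulated break mask
        // (Dst = Vcc | Src), marking the threads for which the condition holds.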
    318 void SILowerControlFlow::IfBreak(MachineInstr &MI) {
    319   MachineBasicBlock &MBB = *MI.getParent();
    320   DebugLoc DL = MI.getDebugLoc();
    321 
    322   unsigned Dst = MI.getOperand(0).getReg();
    323   unsigned Vcc = MI.getOperand(1).getReg();
    324   unsigned Src = MI.getOperand(2).getReg();
    325 
    326   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    327           .addReg(Vcc)
    328           .addReg(Src);
    329 
    330   MI.eraseFromParent();
    331 }
    332 
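        // SI_ELSE_BREAK: merge the break mask accumulated inside the ELSE block (Src)
        // with the one saved before it (Dst = Saved | Src).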
    333 void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
    334   MachineBasicBlock &MBB = *MI.getParent();
    335   DebugLoc DL = MI.getDebugLoc();
    336 
    337   unsigned Dst = MI.getOperand(0).getReg();
    338   unsigned Saved = MI.getOperand(1).getReg();
    339   unsigned Src = MI.getOperand(2).getReg();
    340 
    341   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    342           .addReg(Saved)
    343           .addReg(Src);
    344 
    345   MI.eraseFromParent();
    346 }
    347 
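        // SI_LOOP: disable every thread that has hit a break (EXEC &= ~Src), then
        // branch back to the loop header while any thread in the wavefront is active.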
    348 void SILowerControlFlow::Loop(MachineInstr &MI) {
    349   MachineBasicBlock &MBB = *MI.getParent();
    350   DebugLoc DL = MI.getDebugLoc();
    351   unsigned Src = MI.getOperand(0).getReg();
    352 
    353   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    354           .addReg(AMDGPU::EXEC)
    355           .addReg(Src);
    356 
    357   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    358     .addOperand(MI.getOperand(1));
    359 
    360   MI.eraseFromParent();
    361 }
    362 
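        // SI_END_CF: at the top of the join block, re-enable the threads that were
        // masked off when the control flow region was entered (EXEC |= Reg).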
    363 void SILowerControlFlow::EndCf(MachineInstr &MI) {
    364   MachineBasicBlock &MBB = *MI.getParent();
    365   DebugLoc DL = MI.getDebugLoc();
    366   unsigned Reg = MI.getOperand(0).getReg();
    367 
    368   BuildMI(MBB, MBB.getFirstNonPHI(), DL,
    369           TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    370           .addReg(AMDGPU::EXEC)
    371           .addReg(Reg);
    372 
    373   MI.eraseFromParent();
    374 }
    375 
    376 void SILowerControlFlow::Branch(MachineInstr &MI) {
    377   MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
    378   if (MBB == MI.getParent()->getNextNode())
    379     MI.eraseFromParent();
    380 
    381   // If these aren't equal, this is probably an infinite loop.
    382 }
    383 
    384 void SILowerControlFlow::Kill(MachineInstr &MI) {
    385   MachineBasicBlock &MBB = *MI.getParent();
    386   DebugLoc DL = MI.getDebugLoc();
    387   const MachineOperand &Op = MI.getOperand(0);
    388 
    389 #ifndef NDEBUG
    390   CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
    391   // Kill is only allowed in pixel / geometry shaders.
    392   assert(CallConv == CallingConv::AMDGPU_PS ||
    393          CallConv == CallingConv::AMDGPU_GS);
    394 #endif
    395 
    396   // Clear this thread from the exec mask if the operand is negative
     397   if (Op.isImm()) {
    398     // Constant operand: Set exec mask to 0 or do nothing
    399     if (Op.getImm() & 0x80000000) {
    400       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    401               .addImm(0);
    402     }
    403   } else {
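            // Variable operand: V_CMPX_LE_F32 writes the result of (0 <= Op) straight
            // to EXEC, so lanes holding a negative (or NaN) value are switched off.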
    404     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
    405            .addImm(0)
    406            .addOperand(Op);
    407   }
    408 
    409   MI.eraseFromParent();
    410 }
    411 
    412 // All currently live registers must remain so in the remainder block.
    413 void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
    414                                                  const MachineRegisterInfo &MRI,
    415                                                  const MachineInstr &MI,
    416                                                  MachineBasicBlock &LoopBB,
    417                                                  MachineBasicBlock &RemainderBB,
    418                                                  unsigned SaveReg,
    419                                                  const MachineOperand &IdxReg) {
    420   // Add reg defined in loop body.
    421   RemainderLiveRegs.addReg(SaveReg);
    422 
    423   if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
    424     if (!Val->isUndef()) {
    425       RemainderLiveRegs.addReg(Val->getReg());
    426       LoopBB.addLiveIn(Val->getReg());
    427     }
    428   }
    429 
    430   for (unsigned Reg : RemainderLiveRegs) {
    431     if (MRI.isAllocatable(Reg))
    432       RemainderBB.addLiveIn(Reg);
    433   }
    434 
    435   const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
    436   if (!Src->isUndef())
    437     LoopBB.addLiveIn(Src->getReg());
    438 
    439   if (!IdxReg.isUndef())
    440     LoopBB.addLiveIn(IdxReg.getReg());
    441   LoopBB.sortUniqueLiveIns();
    442 }
    443 
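        // Emit the body of the "waterfall" loop used when the index lives in a VGPR:
        // each iteration reads the index of the first active lane into M0, performs
        // the MovRel for every lane holding that same index, then disables those
        // lanes until EXEC becomes zero.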
    444 void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
    445                                                 DebugLoc DL,
    446                                                 MachineInstr *MovRel,
    447                                                 const MachineOperand &IdxReg,
    448                                                 int Offset) {
    449   MachineBasicBlock::iterator I = LoopBB.begin();
    450 
    451   // Read the next variant into VCC (lower 32 bits) <- also loop target
    452   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
    453     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
    454 
    455   // Move index from VCC into M0
    456   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    457     .addReg(AMDGPU::VCC_LO);
    458 
    459   // Compare the just read M0 value to all possible Idx values
    460   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
    461     .addReg(AMDGPU::M0)
    462     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
    463 
    464   // Update EXEC, save the original EXEC value to VCC
    465   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
    466     .addReg(AMDGPU::VCC);
    467 
    468   if (Offset != 0) {
    469     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    470       .addReg(AMDGPU::M0)
    471       .addImm(Offset);
    472   }
    473 
    474   // Do the actual move
    475   LoopBB.insert(I, MovRel);
    476 
    477   // Update EXEC, switch all done bits to 0 and all todo bits to 1
    478   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    479     .addReg(AMDGPU::EXEC)
    480     .addReg(AMDGPU::VCC);
    481 
    482   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    483   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    484     .addMBB(&LoopBB);
    485 }
    486 
    487 MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
    488   MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
    489   MachineFunction *MF = MBB.getParent();
    490 
    491   MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
    492   MachineFunction::iterator MBBI(MBB);
    493   ++MBBI;
    494 
    495   MF->insert(MBBI, SkipBB);
    496   MBB.addSuccessor(SkipBB);
    497 
    498   return SkipBB;
    499 }
    500 
    501 std::pair<MachineBasicBlock *, MachineBasicBlock *>
    502 SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
    503                                MachineBasicBlock::iterator I) {
    504   MachineFunction *MF = MBB.getParent();
    505 
    506   // To insert the loop we need to split the block. Move everything after this
    507   // point to a new block, and insert a new empty block between the two.
    508   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
    509   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
    510   MachineFunction::iterator MBBI(MBB);
    511   ++MBBI;
    512 
    513   MF->insert(MBBI, LoopBB);
    514   MF->insert(MBBI, RemainderBB);
    515 
    516   // Move the rest of the block into a new block.
    517   RemainderBB->transferSuccessors(&MBB);
    518   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
    519 
    520   MBB.addSuccessor(LoopBB);
    521 
    522   return std::make_pair(LoopBB, RemainderBB);
    523 }
    524 
    525 // Returns true if a new block was inserted.
    526 bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
    527   MachineBasicBlock &MBB = *MI.getParent();
    528   DebugLoc DL = MI.getDebugLoc();
    529   MachineBasicBlock::iterator I(&MI);
    530 
    531   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    532 
    533   if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
    534     if (Offset != 0) {
    535       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    536         .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
    537         .addImm(Offset);
    538     } else {
    539       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
    540         .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
    541     }
    542 
    543     MBB.insert(I, MovRel);
    544     MI.eraseFromParent();
    545     return false;
    546   }
    547 
    548   MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
    549   SaveOp->setIsDead(false);
    550   unsigned Save = SaveOp->getReg();
    551 
    552   // Reading from a VGPR requires looping over all workitems in the wavefront.
    553   assert(AMDGPU::SReg_64RegClass.contains(Save) &&
    554          AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
    555 
    556   // Save the EXEC mask
    557   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
    558     .addReg(AMDGPU::EXEC);
    559 
    560   LivePhysRegs RemainderLiveRegs(TRI);
    561 
    562   RemainderLiveRegs.addLiveOuts(MBB);
    563 
    564   MachineBasicBlock *LoopBB;
    565   MachineBasicBlock *RemainderBB;
    566 
    567   std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
    568 
    569   for (const MachineInstr &Inst : reverse(*RemainderBB))
    570     RemainderLiveRegs.stepBackward(Inst);
    571 
    572   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    573   LoopBB->addSuccessor(RemainderBB);
    574   LoopBB->addSuccessor(LoopBB);
    575 
    576   splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
    577                           *RemainderBB, Save, *Idx);
    578 
    579   emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
    580 
    581   MachineBasicBlock::iterator First = RemainderBB->begin();
    582   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    583     .addReg(Save);
    584 
    585   MI.eraseFromParent();
    586   return true;
    587 }
    588 
     589 /// \param VecReg  The register which holds element zero of the vector being
     590 ///                 addressed into.
     591 ///
     592 /// \param Offset  The constant offset part of the indirect index,
     593 ///                 e.g. v0 = v[VecReg + Offset].
     594 ///
     595 /// \returns A pair of the register that holds the selected element and the
     596 ///          constant offset that still needs to be added to the index value
     597 ///          stored in M0.
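        //
        // For example (illustrative register numbers), with a four-element vector
        // whose first element is in VGPR4:
        //   Offset =  2  -> (VGPR6, 0)   the offset is folded into the register
        //   Offset =  7  -> (VGPR4, 7)   out of bounds; the offset is left to M0
        // and with a vector whose first element is in VGPR0:
        //   Offset = -1  -> (VGPR0, -1)  clamped to the first register, the remaining
        //                                -1 is added to M0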
    598 std::pair<unsigned, int>
    599 SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
    600   unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
    601   if (!SubReg)
    602     SubReg = VecReg;
    603 
    604   const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
    605   const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
    606   int NumElts = SuperRC->getSize() / RC->getSize();
    607 
    608   int BaseRegIdx = TRI->getHWRegIndex(SubReg);
    609 
    610   // Skip out of bounds offsets, or else we would end up using an undefined
    611   // register.
    612   if (Offset >= NumElts)
    613     return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
    614 
    615   int RegIdx = BaseRegIdx + Offset;
    616   if (RegIdx < 0) {
    617     Offset = RegIdx;
    618     RegIdx = 0;
    619   } else {
    620     Offset = 0;
    621   }
    622 
    623   unsigned Reg = RC->getRegister(RegIdx);
    624   return std::make_pair(Reg, Offset);
    625 }
    626 
    627 // Return true if a new block was inserted.
    628 bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
    629   MachineBasicBlock &MBB = *MI.getParent();
    630   const DebugLoc &DL = MI.getDebugLoc();
    631 
    632   unsigned Dst = MI.getOperand(0).getReg();
    633   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
    634   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    635   unsigned Reg;
    636 
    637   std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
    638 
    639   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    640   if (Idx->getReg() == AMDGPU::NoRegister) {
    641     // Only had a constant offset, copy the register directly.
    642     BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
    643       .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
    644     MI.eraseFromParent();
    645     return false;
    646   }
    647 
    648   MachineInstr *MovRel =
    649     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
    650     .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
    651     .addReg(SrcVec->getReg(), RegState::Implicit);
    652 
    653   return loadM0(MI, MovRel, Offset);
    654 }
    655 
    656 // Return true if a new block was inserted.
    657 bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
    658   MachineBasicBlock &MBB = *MI.getParent();
    659   const DebugLoc &DL = MI.getDebugLoc();
    660 
    661   unsigned Dst = MI.getOperand(0).getReg();
    662   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
    663   unsigned Reg;
    664 
    665   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
    666   std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
    667 
    668   MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
    669   if (Idx->getReg() == AMDGPU::NoRegister) {
    670     // Only had a constant offset, copy the register directly.
    671     BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
    672       .addOperand(*Val);
    673     MI.eraseFromParent();
    674     return false;
    675   }
    676 
    677   MachineInstr *MovRel =
    678     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
    679     .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
    680     .addReg(Dst, RegState::Implicit);
    681 
    682   return loadM0(MI, MovRel, Offset);
    683 }
    684 
    685 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
    686   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    687   TII = ST.getInstrInfo();
    688   TRI = &TII->getRegisterInfo();
    689 
    690   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    691 
    692   bool HaveKill = false;
    693   bool NeedFlat = false;
    694   unsigned Depth = 0;
    695 
    696   MachineFunction::iterator NextBB;
    697 
    698   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
    699        BI != BE; BI = NextBB) {
    700     NextBB = std::next(BI);
    701     MachineBasicBlock &MBB = *BI;
    702 
    703     MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    704     MachineBasicBlock::iterator I, Next;
    705     bool ExecModified = false;
    706 
    707     for (I = MBB.begin(); I != MBB.end(); I = Next) {
    708       Next = std::next(I);
    709 
    710       MachineInstr &MI = *I;
    711 
    712       // Flat uses m0 in case it needs to access LDS.
    713       if (TII->isFLAT(MI))
    714         NeedFlat = true;
    715 
    716       if (I->modifiesRegister(AMDGPU::EXEC, TRI))
    717         ExecModified = true;
    718 
    719       switch (MI.getOpcode()) {
    720         default: break;
    721         case AMDGPU::SI_IF:
    722           ++Depth;
    723           If(MI);
    724           break;
    725 
    726         case AMDGPU::SI_ELSE:
    727           Else(MI, ExecModified);
    728           break;
    729 
    730         case AMDGPU::SI_BREAK:
    731           Break(MI);
    732           break;
    733 
    734         case AMDGPU::SI_IF_BREAK:
    735           IfBreak(MI);
    736           break;
    737 
    738         case AMDGPU::SI_ELSE_BREAK:
    739           ElseBreak(MI);
    740           break;
    741 
    742         case AMDGPU::SI_LOOP:
    743           ++Depth;
    744           Loop(MI);
    745           break;
    746 
    747         case AMDGPU::SI_END_CF:
    748           if (--Depth == 0 && HaveKill) {
    749             HaveKill = false;
    750 
    751             if (skipIfDead(MI, *NextBB)) {
    752               NextBB = std::next(BI);
    753               BE = MF.end();
    754               Next = MBB.end();
    755             }
    756           }
    757           EndCf(MI);
    758           break;
    759 
    760         case AMDGPU::SI_KILL_TERMINATOR:
    761           if (Depth == 0) {
    762             if (skipIfDead(MI, *NextBB)) {
    763               NextBB = std::next(BI);
    764               BE = MF.end();
    765               Next = MBB.end();
    766             }
    767           } else
    768             HaveKill = true;
    769           Kill(MI);
    770           break;
    771 
    772         case AMDGPU::S_BRANCH:
    773           Branch(MI);
    774           break;
    775 
    776         case AMDGPU::SI_INDIRECT_SRC_V1:
    777         case AMDGPU::SI_INDIRECT_SRC_V2:
    778         case AMDGPU::SI_INDIRECT_SRC_V4:
    779         case AMDGPU::SI_INDIRECT_SRC_V8:
    780         case AMDGPU::SI_INDIRECT_SRC_V16:
    781           if (indirectSrc(MI)) {
     782             // The block was split at this point. We can safely skip over the
     783             // newly inserted middle block and continue with the following block,
     784             // which contains the rest of this block's instructions.
    785             NextBB = std::next(BI);
    786             BE = MF.end();
    787             Next = MBB.end();
    788           }
    789 
    790           break;
    791 
    792         case AMDGPU::SI_INDIRECT_DST_V1:
    793         case AMDGPU::SI_INDIRECT_DST_V2:
    794         case AMDGPU::SI_INDIRECT_DST_V4:
    795         case AMDGPU::SI_INDIRECT_DST_V8:
    796         case AMDGPU::SI_INDIRECT_DST_V16:
    797           if (indirectDst(MI)) {
     798             // The block was split at this point. We can safely skip over the
     799             // newly inserted middle block and continue with the following block,
     800             // which contains the rest of this block's instructions.
    801             NextBB = std::next(BI);
    802             BE = MF.end();
    803             Next = MBB.end();
    804           }
    805 
    806           break;
    807 
    808         case AMDGPU::SI_RETURN: {
    809           assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
    810 
    811           // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
    812           // because external bytecode will be appended at the end.
    813           if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
    814             // SI_RETURN is not the last instruction. Add an empty block at
    815             // the end and jump there.
    816             if (!EmptyMBBAtEnd) {
    817               EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
    818               MF.insert(MF.end(), EmptyMBBAtEnd);
    819             }
    820 
    821             MBB.addSuccessor(EmptyMBBAtEnd);
    822             BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
    823                     .addMBB(EmptyMBBAtEnd);
    824             I->eraseFromParent();
    825           }
    826           break;
    827         }
    828       }
    829     }
    830   }
    831 
     832   if (NeedFlat && MFI->IsKernel) {
     833     // TODO: What to use with function calls?
     834     // We will need to initialize the flat scratch register pair.
     835     MFI->setHasFlatInstructions(true);
     836   }
    838 
    839   return true;
    840 }
    841