//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Save the exec mask of the IF block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Enable the lanes for the ELSE block
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
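///
/// Loops are lowered by the same pass with the same mask bookkeeping. As an
/// illustrative sketch only (labels and register numbers here are invented,
/// not taken from a real test), a loop built from the break pseudos:
///
/// label_loop:
/// %SGPR2 = SI_IF_BREAK %VCC, %SGPR2   // accumulate lanes that want to exit
/// SI_LOOP %SGPR2, label_loop
/// SI_END_CF %SGPR2
///
/// becomes roughly:
///
/// label_loop:
/// %SGPR2 = S_OR_B64 %VCC, %SGPR2      // IfBreak(): accumulate the exit mask
/// %EXEC = S_ANDN2_B64 %EXEC, %SGPR2   // Loop(): disable lanes that are done
/// S_CBRANCH_EXECNZ label_loop         // Loop(): repeat while any lane is live
/// %EXEC = S_OR_B64 %EXEC, %SGPR2      // EndCf(): re-enable the exited lanes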
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

namespace {

class SILowerControlFlowPass : public MachineFunctionPass {

private:
  static const unsigned SkipThreshold = 12;

  static char ID;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SILowerControlFlowPass::ID = 0;

FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
}

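// Heuristic used by Skip() and SkipIfDead(): returns true if the straight-line
// region between \p From and \p To contains at least SkipThreshold top-level
// (non-bundled) instructions, i.e. if branching over the region is likely to
// be cheaper than letting the inactive lanes execute it. A tiny IF block of a
// couple of VALU instructions, for example, is not worth a branch.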
bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
          return true;
    }
  }

  return false;
}

void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

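// In a pixel shader, once all lanes have been killed the rest of the program
// can be abandoned: export a null value and end the wavefront. Rough shape of
// the emitted sequence (the branch offset of 3 skips over the exit code when
// EXEC is still non-zero; the exact operand encoding is in the code below):
//
//   S_CBRANCH_EXECNZ 3     // some lanes still alive -> keep going
//   EXP                    // export to the NULL target
//   S_ENDPGM               // terminate the wavefront
//
// This is emitted only for pixel shaders and only when shouldSkip() says the
// remaining code is long enough to be worth skipping.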
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
          .addImm(0)
          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
          .addImm(0)
          .addImm(1)
          .addImm(1)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0)
          .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

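// Lower SI_IF: save the current exec mask into the destination SGPR pair,
// restrict EXEC to the lanes for which the condition holds, and optionally
// branch over the IF block when no lane takes it. Sketch with placeholder
// registers:
//
//   %sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 %vcc
//   %sgpr0_sgpr1 = S_XOR_B64 %exec, %sgpr0_sgpr1  // lanes still owed the ELSE
//   S_CBRANCH_EXECZ <join block>                  // only if worth skipping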
void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

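// Lower SI_ELSE: at the start of the ELSE block, save the mask of lanes that
// ran the IF block and flip EXEC to the remaining lanes. Sketch with
// placeholder registers:
//
//   %sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 %sgpr0_sgpr1  // placed at block start
//   %exec = S_XOR_B64 %exec, %sgpr0_sgpr1          // enable the ELSE lanes
//   S_CBRANCH_EXECZ <join block>                   // only if worth skipping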
void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

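// SI_BREAK, SI_IF_BREAK and SI_ELSE_BREAK each lower to a single S_OR_B64 that
// accumulates, into the loop's break mask, the lanes that want to leave the
// loop. For example (placeholder registers), SI_IF_BREAK becomes:
//
//   %sgpr2_sgpr3 = S_OR_B64 %vcc, %sgpr2_sgpr3
//
// The accumulated mask is consumed later by Loop() and EndCf().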
void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}

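// Lower SI_LOOP: mask off the lanes that have hit a break and branch back to
// the loop header while any lane is still active. Sketch with placeholder
// registers:
//
//   %exec = S_ANDN2_B64 %exec, %sgpr2_sgpr3   // drop the lanes that broke out
//   S_CBRANCH_EXECNZ <loop header>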
void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

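// Lower SI_END_CF: re-enable the lanes recorded in the saved mask at the top
// of the join block, e.g. (placeholder registers):
//
//   %exec = S_OR_B64 %exec, %sgpr0_sgpr1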
void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

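// Lower SI_KILL. A negative immediate operand kills every lane, so EXEC is
// simply cleared; a register operand is handled with V_CMPX_LE_F32, which
// compares 0 <= src and writes the result directly into EXEC (and VCC),
// disabling the lanes holding a negative value. Sketch (placeholder register):
//
//   %exec = S_MOV_B64 0           // negative immediate: kill everything
//   V_CMPX_LE_F32_e32 0, %vgpr0   // register operand: kill where %vgpr0 < 0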
void SILowerControlFlowPass::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
           .addImm(0)
           .addOperand(Op);
  }

  MI.eraseFromParent();
}

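// Set up M0 for the indirect move \p MovRel and insert it. A scalar (SGPR)
// index can be copied into M0 directly. A vector (VGPR) index may differ per
// lane, so a "waterfall" loop is emitted instead: read the index of the first
// active lane, run MovRel for every lane sharing that index, disable those
// lanes, and repeat until all lanes are handled. Roughly (with %vgpr1 as a
// placeholder index register):
//
//   %sgpr0_sgpr1 = S_MOV_B64 %exec         // save EXEC
// loop:
//   %vcc_lo = V_READFIRSTLANE_B32 %vgpr1   // index of the first active lane
//   %m0 = S_MOV_B32 %vcc_lo
//   V_CMP_EQ_U32_e32 %m0, %vgpr1           // lanes using the same index
//   %vcc = S_AND_SAVEEXEC_B64 %vcc
//   <MovRel>                               // indirect move for those lanes
//   %exec = S_XOR_B64 %exec, %vcc          // mark them as done
//   S_CBRANCH_EXECNZ loop
//   %exec = S_MOV_B64 %sgpr0_sgpr1         // restore EXEC
//
// The optional constant Offset is folded into M0 with an S_ADD_I32.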
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
              .addReg(Idx)
              .addImm(Offset);
    } else {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
              .addReg(Idx);
    }
    MBB.insert(I, MovRel);
  } else {

    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VGPR_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
            .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
            .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
            .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
      .addReg(AMDGPU::M0)
      .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
            .addReg(AMDGPU::VCC);

    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
              .addReg(AMDGPU::M0)
              .addImm(Offset);
    }
    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
      .addImm(-7);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
            .addReg(Save);

  }
  MI.eraseFromParent();
}

/// \param VecReg The register which holds element zero of the vector being
///               addressed into.
/// \param[out] Reg The base register to use in the indirect addressing
///                 instruction.
/// \param[in,out] Offset As an input, this is the constant offset part of the
///                       indirect Index, e.g. v0 = v[VecReg + Offset]. As an
///                       output, this is a constant value that needs to be
///                       added to the value stored in M0.
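///
/// Hypothetical example: if VecReg is a four-element vector starting at v4 and
/// Offset is 2 on entry, the computed hardware index is 4 + 2 = 6, so Reg
/// becomes v6 and Offset is returned as 0 (M0 then only needs the dynamic part
/// of the index). If the sum were negative, Reg would be clamped to element
/// zero and the negative remainder handed back in Offset for M0 to absorb.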
void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
                                                         unsigned &Reg,
                                                         int &Offset) {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;

  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  Reg = RC->getRegister(RegIdx);
}

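// Lower SI_INDIRECT_SRC_*: read one element of a vector of VGPRs at a dynamic
// index. With placeholder registers, the emitted move has the form
//
//   %vgpr0 = V_MOVRELS_B32_e32 %vgpr4   // reads v[4 + M0]
//
// where M0 is set up (possibly via the waterfall loop) by LoadM0().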
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Reg;

  computeIndirectRegAndOffset(Vec, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
            .addReg(Reg)
            .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}

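// Lower SI_INDIRECT_DST_*: write one element of a vector of VGPRs at a dynamic
// index; the counterpart of IndirectSrc(), using V_MOVRELD_B32_e32. With
// placeholder registers:
//
//   V_MOVRELD_B32_e32 %vgpr4, %vgpr0   // writes v[4 + M0] = %vgpr0
//
// again with M0 prepared by LoadM0().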
void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned Reg;

  computeIndirectRegAndOffset(Dst, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
            .addReg(Reg, RegState::Define)
            .addReg(Val)
            .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}

bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isWQM(MI) || TII->isDS(MI))
        NeedWQM = true;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;
      }
    }
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting stack size that is used in a scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
    assert(StackSizeBytes < 0xffff && StackOffset < 0xffff &&
           "Stack limits should fit in 16 bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256-bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes"
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}