//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will
/// be cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

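// Estimate whether the code between \p From and \p To is worth branching over
// when the exec mask is zero: returns true once SkipThreshold real
// instructions have been counted, or as soon as an instruction is found that
// must not be executed with EXEC = 0.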
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

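// After a kill in a pixel shader, check whether it is worth testing for all
// lanes being dead: if so, append an s_cbranch_execnz to \p NextBB and insert
// a block after \p MI that exports to the NULL target and ends the program
// for the EXEC = 0 case. Returns true if the skip block was inserted.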
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

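// Lower a SI_KILL_*_TERMINATOR pseudo into the exec-mask update that actually
// disables the killed lanes: a V_CMPX compare for the float-compare form, or
// an S_AND/S_ANDN2 (or S_MOV 0) of EXEC for the i1 form.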
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0);  // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

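// Create a new, empty basic block immediately after \p MBB, add it as a
// successor, and return it; skipIfDead fills it with the export-null and
// endpgm instructions that run when all lanes are dead.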
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}

// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

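// Walk every block in layout order: turn SI_MASK_BRANCH pseudos into
// s_cbranch_execz skips when profitable, lower SI_KILL_*_TERMINATOR pseudos,
// and rewrite SI_RETURN_TO_EPILOG so that it ends up as the last thing the
// shader executes.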
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // A skip block has been inserted after the current block; remove
          // this unconditional branch and let the s_cbranch_execnz skip the
          // two instructions performing the kill when the exec mask is
          // non-zero.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}