Home | History | Annotate | Download | only in AMDGPU
//===-- SIInsertWaits.cpp - Insert wait instructions for memory ops ------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//
     18 
     19 #include "AMDGPU.h"
     20 #include "AMDGPUSubtarget.h"
     21 #include "SIDefines.h"
     22 #include "SIInstrInfo.h"
     23 #include "SIMachineFunctionInfo.h"
     24 #include "llvm/CodeGen/MachineFunction.h"
     25 #include "llvm/CodeGen/MachineFunctionPass.h"
     26 #include "llvm/CodeGen/MachineInstrBuilder.h"
     27 #include "llvm/CodeGen/MachineRegisterInfo.h"
     28 
     29 using namespace llvm;
     30 
     31 namespace {
     32 
     33 /// \brief One variable for each of the hardware counters
     34 typedef union {
     35   struct {
     36     unsigned VM;
     37     unsigned EXP;
     38     unsigned LGKM;
     39   } Named;
     40   unsigned Array[3];
     41 
     42 } Counters;
     43 
     44 typedef enum {
     45   OTHER,
     46   SMEM,
     47   VMEM
     48 } InstType;
     49 
     50 typedef Counters RegCounters[512];
     51 typedef std::pair<unsigned, unsigned> RegInterval;
     52 
     53 class SIInsertWaits : public MachineFunctionPass {
     54 
     55 private:
     56   static char ID;
     57   const SIInstrInfo *TII;
     58   const SIRegisterInfo *TRI;
     59   const MachineRegisterInfo *MRI;
     60 
     61   /// \brief Constant hardware limits
     62   static const Counters WaitCounts;
     63 
     64   /// \brief Constant zero value
     65   static const Counters ZeroCounts;
     66 
     67   /// \brief Counter values we have already waited on.
     68   Counters WaitedOn;
     69 
     70   /// \brief Counter values for last instruction issued.
     71   Counters LastIssued;
     72 
     73   /// \brief Registers used by async instructions.
     74   RegCounters UsedRegs;
     75 
     76   /// \brief Registers defined by async instructions.
     77   RegCounters DefinedRegs;
     78 
     79   /// \brief Different export instruction types seen since last wait.
     80   unsigned ExpInstrTypesSeen;
     81 
     82   /// \brief Type of the last opcode.
     83   InstType LastOpcodeType;
     84 
     85   bool LastInstWritesM0;
     86 
     87   /// \brief Get increment/decrement amount for this instruction.
     88   Counters getHwCounts(MachineInstr &MI);
     89 
     90   /// \brief Is operand relevant for async execution?
     91   bool isOpRelevant(MachineOperand &Op);
     92 
     93   /// \brief Get register interval an operand affects.
     94   RegInterval getRegInterval(const TargetRegisterClass *RC,
     95                              const MachineOperand &Reg) const;
     96 
     97   /// \brief Handle instructions async components
     98   void pushInstruction(MachineBasicBlock &MBB,
     99                        MachineBasicBlock::iterator I);
    100 
    101   /// \brief Insert the actual wait instruction
    102   bool insertWait(MachineBasicBlock &MBB,
    103                   MachineBasicBlock::iterator I,
    104                   const Counters &Counts);
    105 
    106   /// \brief Do we need def2def checks?
    107   bool unorderedDefines(MachineInstr &MI);
    108 
    109   /// \brief Resolve all operand dependencies to counter requirements
    110   Counters handleOperands(MachineInstr &MI);
    111 
    112   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
    113   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
    114 
    115 public:
    116   SIInsertWaits(TargetMachine &tm) :
    117     MachineFunctionPass(ID),
    118     TII(nullptr),
    119     TRI(nullptr),
    120     ExpInstrTypesSeen(0) { }
    121 
    122   bool runOnMachineFunction(MachineFunction &MF) override;
    123 
    124   const char *getPassName() const override {
    125     return "SI insert wait instructions";
    126   }
    127 
    128   void getAnalysisUsage(AnalysisUsage &AU) const override {
    129     AU.setPreservesCFG();
    130     MachineFunctionPass::getAnalysisUsage(AU);
    131   }
    132 };
    133 
    134 } // End anonymous namespace
    135 
char SIInsertWaits::ID = 0;

// Hardware maxima for each counter field. These match the S_WAITCNT
// immediate encoding built in insertWait(): VM_CNT is a 4-bit field (max 15),
// EXP_CNT and LGKM_CNT are 3-bit fields (max 7).
const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };

// All-zero counter value, used for initialization and resets.
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
    140 
/// Factory entry point; the pass manager takes ownership of the returned pass.
FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
  return new SIInsertWaits(tm);
}
    144 
    145 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
    146   uint64_t TSFlags = MI.getDesc().TSFlags;
    147   Counters Result = { { 0, 0, 0 } };
    148 
    149   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
    150 
    151   // Only consider stores or EXP for EXP_CNT
    152   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
    153       (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
    154 
    155   // LGKM may uses larger values
    156   if (TSFlags & SIInstrFlags::LGKM_CNT) {
    157 
    158     if (TII->isSMRD(MI)) {
    159 
    160       if (MI.getNumOperands() != 0) {
    161         assert(MI.getOperand(0).isReg() &&
    162                "First LGKM operand must be a register!");
    163 
    164         // XXX - What if this is a write into a super register?
    165         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
    166         unsigned Size = RC->getSize();
    167         Result.Named.LGKM = Size > 4 ? 2 : 1;
    168       } else {
    169         // s_dcache_inv etc. do not have a a destination register. Assume we
    170         // want a wait on these.
    171         // XXX - What is the right value?
    172         Result.Named.LGKM = 1;
    173       }
    174     } else {
    175       // DS
    176       Result.Named.LGKM = 1;
    177     }
    178 
    179   } else {
    180     Result.Named.LGKM = 0;
    181   }
    182 
    183   return Result;
    184 }
    185 
    186 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
    187   // Constants are always irrelevant
    188   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
    189     return false;
    190 
    191   // Defines are always relevant
    192   if (Op.isDef())
    193     return true;
    194 
    195   // For exports all registers are relevant
    196   MachineInstr &MI = *Op.getParent();
    197   if (MI.getOpcode() == AMDGPU::EXP)
    198     return true;
    199 
    200   // For stores the stored value is also relevant
    201   if (!MI.getDesc().mayStore())
    202     return false;
    203 
    204   // Check if this operand is the value being stored.
    205   // Special case for DS instructions, since the address
    206   // operand comes before the value operand and it may have
    207   // multiple data operands.
    208 
    209   if (TII->isDS(MI)) {
    210     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
    211     if (Data && Op.isIdenticalTo(*Data))
    212       return true;
    213 
    214     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
    215     if (Data0 && Op.isIdenticalTo(*Data0))
    216       return true;
    217 
    218     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
    219     if (Data1 && Op.isIdenticalTo(*Data1))
    220       return true;
    221 
    222     return false;
    223   }
    224 
    225   // NOTE: This assumes that the value operand is before the
    226   // address operand, and that there is only one value operand.
    227   for (MachineInstr::mop_iterator I = MI.operands_begin(),
    228        E = MI.operands_end(); I != E; ++I) {
    229 
    230     if (I->isReg() && I->isUse())
    231       return Op.isIdenticalTo(*I);
    232   }
    233 
    234   return false;
    235 }
    236 
    237 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
    238                                           const MachineOperand &Reg) const {
    239   unsigned Size = RC->getSize();
    240   assert(Size >= 4);
    241 
    242   RegInterval Result;
    243   Result.first = TRI->getEncodingValue(Reg.getReg());
    244   Result.second = Result.first + Size / 4;
    245 
    246   return Result;
    247 }
    248 
/// Account for one issued instruction: advance the running issue counters,
/// break VI SMEM/VMEM clauses with S_NOP, and record which register slots
/// this instruction uses/defines asynchronously.
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I) {

  // Get the hardware counter increments and sum them up
  Counters Increment = getHwCounts(*I);
  Counters Limit = ZeroCounts;
  unsigned Sum = 0;

  // Advance LastIssued by this instruction's increments. For each counter
  // the instruction actually bumps, Limit records the new issue count; a
  // later consumer of this instruction's registers must wait until the
  // hardware counter catches up to that value.
  for (unsigned i = 0; i < 3; ++i) {
    LastIssued.Array[i] += Increment.Array[i];
    if (Increment.Array[i])
      Limit.Array[i] = LastIssued.Array[i];
    Sum += Increment.Array[i];
  }

  // If we don't increase anything then that's it
  if (Sum == 0) {
    LastOpcodeType = OTHER;
    return;
  }

  if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
      AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
    // or SMEM clause, respectively.
    //
    // The temporary workaround is to break the clauses with S_NOP.
    //
    // The proper solution would be to allocate registers such that all source
    // and destination registers don't overlap, e.g. this is illegal:
    //   r0 = load r2
    //   r2 = load r0
    if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
        (LastOpcodeType == VMEM && Increment.Named.VM)) {
      // Insert a NOP to break the clause.
      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
      // The inserted S_NOP does not write M0.
      LastInstWritesM0 = false;
    }

    if (TII->isSMRD(*I))
      LastOpcodeType = SMEM;
    else if (Increment.Named.VM)
      LastOpcodeType = VMEM;
  }

  // Remember which export instructions we have seen
  // (bit 0: EXP opcode, bit 1: other EXP_CNT instruction, i.e. a store).
  if (Increment.Named.EXP) {
    ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
  }

  // Stamp every register slot this instruction touches with the issue
  // counts recorded in Limit, split by def vs. use.
  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    MachineOperand &Op = I->getOperand(i);
    if (!isOpRelevant(Op))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // Remember which registers we define
      if (Op.isDef())
        DefinedRegs[j] = Limit;

      // and which one we are using
      if (Op.isUse())
        UsedRegs[j] = Limit;
    }
  }
}
    319 
/// Emit an S_WAITCNT before \p I if the counter values in \p Required have
/// not yet been waited on. Returns true if an instruction was inserted.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {

  // End of program? No need to wait on anything
  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
    return false;

  // Figure out if the async instructions execute in order
  bool Ordered[3];

  // VM_CNT is always ordered
  Ordered[0] = true;

  // EXP_CNT is unordered if we have both EXP & VM-writes
  // NOTE(review): the expression below yields Ordered[1] == true exactly
  // when both types have been seen (ExpInstrTypesSeen == 3), which looks
  // inverted relative to the comment above -- confirm intended semantics.
  Ordered[1] = ExpInstrTypesSeen == 3;

  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
  Ordered[2] = false;

  // The values we are going to put into the S_WAITCNT instruction
  Counters Counts = WaitCounts;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned i = 0; i < 3; ++i) {

    // Already waited past the required issue count for this counter.
    if (Required.Array[i] <= WaitedOn.Array[i])
      continue;

    NeedWait = true;

    if (Ordered[i]) {
      // In-order counter: it suffices to wait until at most
      // (LastIssued - Required) operations remain outstanding.
      unsigned Value = LastIssued.Array[i] - Required.Array[i];

      // Adjust the value to the real hardware possibilities.
      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);

    } else
      // Out-of-order counter: must drain completely (wait for zero).
      Counts.Array[i] = 0;

    // Remember on what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
  }

  if (!NeedWait)
    return false;

  // Reset EXP_CNT instruction types
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Build the wait instruction. S_WAITCNT immediate layout used here:
  // bits 3:0 = VM_CNT, bits 6:4 = EXP_CNT, bits 10:8 = LGKM_CNT.
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
          .addImm((Counts.Named.VM & 0xF) |
                  ((Counts.Named.EXP & 0x7) << 4) |
                  ((Counts.Named.LGKM & 0x7) << 8));

  // The inserted wait breaks any clause and does not write M0.
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  return true;
}
    383 
    384 /// \brief helper function for handleOperands
    385 static void increaseCounters(Counters &Dst, const Counters &Src) {
    386 
    387   for (unsigned i = 0; i < 3; ++i)
    388     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
    389 }
    390 
    391 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
    392 
    393   Counters Result = ZeroCounts;
    394 
    395   // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
    396   // but we also want to wait for any other outstanding transfers before
    397   // signalling other hardware blocks
    398   if (MI.getOpcode() == AMDGPU::S_SENDMSG)
    399     return LastIssued;
    400 
    401   // For each register affected by this instruction increase the result
    402   // sequence.
    403   //
    404   // TODO: We could probably just look at explicit operands if we removed VCC /
    405   // EXEC from SMRD dest reg classes.
    406   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    407     MachineOperand &Op = MI.getOperand(i);
    408     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
    409       continue;
    410 
    411     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
    412     RegInterval Interval = getRegInterval(RC, Op);
    413     for (unsigned j = Interval.first; j < Interval.second; ++j) {
    414 
    415       if (Op.isDef()) {
    416         increaseCounters(Result, UsedRegs[j]);
    417         increaseCounters(Result, DefinedRegs[j]);
    418       }
    419 
    420       if (Op.isUse())
    421         increaseCounters(Result, DefinedRegs[j]);
    422     }
    423   }
    424 
    425   return Result;
    426 }
    427 
    428 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
    429                                   MachineBasicBlock::iterator I) {
    430   if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <
    431       AMDGPUSubtarget::VOLCANIC_ISLANDS)
    432     return;
    433 
    434   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
    435   if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
    436     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
    437     LastInstWritesM0 = false;
    438     return;
    439   }
    440 
    441   // Set whether this instruction sets M0
    442   LastInstWritesM0 = false;
    443 
    444   unsigned NumOperands = I->getNumOperands();
    445   for (unsigned i = 0; i < NumOperands; i++) {
    446     const MachineOperand &Op = I->getOperand(i);
    447 
    448     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
    449       LastInstWritesM0 = true;
    450   }
    451 }
    452 
    453 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
    454 // around other non-memory instructions.
/// Walk every instruction of \p MF, inserting S_WAITCNT (and workaround
/// S_NOP) instructions where required. Returns true if the function changed.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());

  MRI = &MF.getRegInfo();

  // Reset all per-function bookkeeping.
  WaitedOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    // insertWait()/pushInstruction()/handleSendMsg() may insert new
    // instructions *before* I; iteration stays valid because I itself is
    // never erased, so the explicit iterator loop is kept deliberately.
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {

      // Wait for everything before a barrier.
      if (I->getOpcode() == AMDGPU::S_BARRIER)
        Changes |= insertWait(MBB, I, LastIssued);
      else
        Changes |= insertWait(MBB, I, handleOperands(*I));

      // Record the counters and register slots this instruction touches,
      // then handle the M0/S_SENDMSG hazard for it.
      pushInstruction(MBB, I);
      handleSendMsg(MBB, I);
    }

    // Wait for everything at the end of the MBB
    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  return Changes;
}
    495