Home | History | Annotate | Download | only in AMDGPU
//===-- SIInsertWaits.cpp - Insert S_WAITCNT for async memory ops ---------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief Insert wait instructions for memory reads and writes.
     12 ///
     13 /// Memory reads and writes are issued asynchronously, so we need to insert
     14 /// S_WAITCNT instructions when we want to access any of their results or
     15 /// overwrite any register that's used asynchronously.
     16 //
     17 //===----------------------------------------------------------------------===//
     18 
     19 #include "AMDGPU.h"
     20 #include "AMDGPUSubtarget.h"
     21 #include "SIDefines.h"
     22 #include "SIInstrInfo.h"
     23 #include "SIMachineFunctionInfo.h"
     24 #include "llvm/CodeGen/MachineFunction.h"
     25 #include "llvm/CodeGen/MachineFunctionPass.h"
     26 #include "llvm/CodeGen/MachineInstrBuilder.h"
     27 #include "llvm/CodeGen/MachineRegisterInfo.h"
     28 
     29 #define DEBUG_TYPE "si-insert-waits"
     30 
     31 using namespace llvm;
     32 
     33 namespace {
     34 
     35 /// \brief One variable for each of the hardware counters
     36 typedef union {
     37   struct {
     38     unsigned VM;
     39     unsigned EXP;
     40     unsigned LGKM;
     41   } Named;
     42   unsigned Array[3];
     43 
     44 } Counters;
     45 
     46 typedef enum {
     47   OTHER,
     48   SMEM,
     49   VMEM
     50 } InstType;
     51 
     52 typedef Counters RegCounters[512];
     53 typedef std::pair<unsigned, unsigned> RegInterval;
     54 
     55 class SIInsertWaits : public MachineFunctionPass {
     56 
     57 private:
     58   const SISubtarget *ST;
     59   const SIInstrInfo *TII;
     60   const SIRegisterInfo *TRI;
     61   const MachineRegisterInfo *MRI;
     62 
     63   /// \brief Constant hardware limits
     64   static const Counters WaitCounts;
     65 
     66   /// \brief Constant zero value
     67   static const Counters ZeroCounts;
     68 
     69   /// \brief Counter values we have already waited on.
     70   Counters WaitedOn;
     71 
     72   /// \brief Counter values that we must wait on before the next counter
     73   /// increase.
     74   Counters DelayedWaitOn;
     75 
     76   /// \brief Counter values for last instruction issued.
     77   Counters LastIssued;
     78 
     79   /// \brief Registers used by async instructions.
     80   RegCounters UsedRegs;
     81 
     82   /// \brief Registers defined by async instructions.
     83   RegCounters DefinedRegs;
     84 
     85   /// \brief Different export instruction types seen since last wait.
     86   unsigned ExpInstrTypesSeen;
     87 
     88   /// \brief Type of the last opcode.
     89   InstType LastOpcodeType;
     90 
     91   bool LastInstWritesM0;
     92 
     93   /// \brief Whether the machine function returns void
     94   bool ReturnsVoid;
     95 
     96   /// Whether the VCCZ bit is possibly corrupt
     97   bool VCCZCorrupt;
     98 
     99   /// \brief Get increment/decrement amount for this instruction.
    100   Counters getHwCounts(MachineInstr &MI);
    101 
    102   /// \brief Is operand relevant for async execution?
    103   bool isOpRelevant(MachineOperand &Op);
    104 
    105   /// \brief Get register interval an operand affects.
    106   RegInterval getRegInterval(const TargetRegisterClass *RC,
    107                              const MachineOperand &Reg) const;
    108 
    109   /// \brief Handle instructions async components
    110   void pushInstruction(MachineBasicBlock &MBB,
    111                        MachineBasicBlock::iterator I,
    112                        const Counters& Increment);
    113 
    114   /// \brief Insert the actual wait instruction
    115   bool insertWait(MachineBasicBlock &MBB,
    116                   MachineBasicBlock::iterator I,
    117                   const Counters &Counts);
    118 
    119   /// \brief Handle existing wait instructions (from intrinsics)
    120   void handleExistingWait(MachineBasicBlock::iterator I);
    121 
    122   /// \brief Do we need def2def checks?
    123   bool unorderedDefines(MachineInstr &MI);
    124 
    125   /// \brief Resolve all operand dependencies to counter requirements
    126   Counters handleOperands(MachineInstr &MI);
    127 
    128   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
    129   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
    130 
    131   /// Return true if there are LGKM instrucitons that haven't been waited on
    132   /// yet.
    133   bool hasOutstandingLGKM() const;
    134 
    135 public:
    136   static char ID;
    137 
    138   SIInsertWaits() :
    139     MachineFunctionPass(ID),
    140     ST(nullptr),
    141     TII(nullptr),
    142     TRI(nullptr),
    143     ExpInstrTypesSeen(0),
    144     VCCZCorrupt(false) { }
    145 
    146   bool runOnMachineFunction(MachineFunction &MF) override;
    147 
    148   const char *getPassName() const override {
    149     return "SI insert wait instructions";
    150   }
    151 
    152   void getAnalysisUsage(AnalysisUsage &AU) const override {
    153     AU.setPreservesCFG();
    154     MachineFunctionPass::getAnalysisUsage(AU);
    155   }
    156 };
    157 
    158 } // End anonymous namespace
    159 
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
                      "SI Insert Waits", false, false)
INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
                    "SI Insert Waits", false, false)

// Pass ID storage; its address uniquely identifies this pass.
char SIInsertWaits::ID = 0;

char &llvm::SIInsertWaitsID = SIInsertWaits::ID;

// Factory used when building the AMDGPU codegen pipeline.
FunctionPass *llvm::createSIInsertWaitsPass() {
  return new SIInsertWaits();
}

// Maximum values encodable in an S_WAITCNT immediate: 15 for VM_CNT,
// 7 for EXP_CNT and 15 for LGKM_CNT (matching the 4/3/4-bit fields used
// when the immediate is built in insertWait).
const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } };
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
    175 
    176 static bool readsVCCZ(unsigned Opcode) {
    177   return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
    178 }
    179 
    180 bool SIInsertWaits::hasOutstandingLGKM() const {
    181   return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
    182 }
    183 
    184 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
    185   uint64_t TSFlags = MI.getDesc().TSFlags;
    186   Counters Result = { { 0, 0, 0 } };
    187 
    188   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
    189 
    190   // Only consider stores or EXP for EXP_CNT
    191   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
    192       (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
    193 
    194   // LGKM may uses larger values
    195   if (TSFlags & SIInstrFlags::LGKM_CNT) {
    196 
    197     if (TII->isSMRD(MI)) {
    198 
    199       if (MI.getNumOperands() != 0) {
    200         assert(MI.getOperand(0).isReg() &&
    201                "First LGKM operand must be a register!");
    202 
    203         // XXX - What if this is a write into a super register?
    204         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
    205         unsigned Size = RC->getSize();
    206         Result.Named.LGKM = Size > 4 ? 2 : 1;
    207       } else {
    208         // s_dcache_inv etc. do not have a a destination register. Assume we
    209         // want a wait on these.
    210         // XXX - What is the right value?
    211         Result.Named.LGKM = 1;
    212       }
    213     } else {
    214       // DS
    215       Result.Named.LGKM = 1;
    216     }
    217 
    218   } else {
    219     Result.Named.LGKM = 0;
    220   }
    221 
    222   return Result;
    223 }
    224 
/// \brief Is operand \p Op relevant for async execution tracking?
///
/// True for register defs, for any register operand of an EXP instruction,
/// and for the stored-value operand(s) of store instructions.
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
  // Constants are always irrelevant
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
    return false;

  // Defines are always relevant
  if (Op.isDef())
    return true;

  // For exports all registers are relevant
  MachineInstr &MI = *Op.getParent();
  if (MI.getOpcode() == AMDGPU::EXP)
    return true;

  // For stores the stored value is also relevant
  if (!MI.getDesc().mayStore())
    return false;

  // Check if this operand is the value being stored.
  // Special case for DS/FLAT instructions, since the address
  // operand comes before the value operand and it may have
  // multiple data operands.

  if (TII->isDS(MI) || TII->isFLAT(MI)) {
    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
    if (Data && Op.isIdenticalTo(*Data))
      return true;
  }

  // DS instructions may carry two data operands (e.g. 2-address writes).
  if (TII->isDS(MI)) {
    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
    if (Data0 && Op.isIdenticalTo(*Data0))
      return true;

    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
    return Data1 && Op.isIdenticalTo(*Data1);
  }

  // NOTE: This assumes that the value operand is before the
  // address operand, and that there is only one value operand.
  for (MachineInstr::mop_iterator I = MI.operands_begin(),
       E = MI.operands_end(); I != E; ++I) {

    if (I->isReg() && I->isUse())
      return Op.isIdenticalTo(*I);
  }

  return false;
}
    274 
    275 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
    276                                           const MachineOperand &Reg) const {
    277   unsigned Size = RC->getSize();
    278   assert(Size >= 4);
    279 
    280   RegInterval Result;
    281   Result.first = TRI->getEncodingValue(Reg.getReg());
    282   Result.second = Result.first + Size / 4;
    283 
    284   return Result;
    285 }
    286 
/// \brief Account for the counter increments of the instruction at \p I and
/// record which registers it defines/uses asynchronously.
///
/// On VI also breaks VMEM clauses with S_NOP (see workaround comment below).
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const Counters &Increment) {

  // Get the hardware counter increments and sum them up
  // Limit records, for each counter this instruction bumps, the value the
  // counter must be waited down from before its registers are safe to touch.
  Counters Limit = ZeroCounts;
  unsigned Sum = 0;

  for (unsigned i = 0; i < 3; ++i) {
    LastIssued.Array[i] += Increment.Array[i];
    if (Increment.Array[i])
      Limit.Array[i] = LastIssued.Array[i];
    Sum += Increment.Array[i];
  }

  // If we don't increase anything then that's it
  if (Sum == 0) {
    LastOpcodeType = OTHER;
    return;
  }

  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
    // or SMEM clause, respectively.
    //
    // The temporary workaround is to break the clauses with S_NOP.
    //
    // The proper solution would be to allocate registers such that all source
    // and destination registers don't overlap, e.g. this is illegal:
    //   r0 = load r2
    //   r2 = load r0
    if (LastOpcodeType == VMEM && Increment.Named.VM) {
      // Insert a NOP to break the clause.
      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
      LastInstWritesM0 = false;
    }

    if (TII->isSMRD(*I))
      LastOpcodeType = SMEM;
    else if (Increment.Named.VM)
      LastOpcodeType = VMEM;
  }

  // Remember which export instructions we have seen
  // (bit 0: EXP instruction, bit 1: VM-write; both set => unordered EXP_CNT).
  if (Increment.Named.EXP) {
    ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
  }

  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    MachineOperand &Op = I->getOperand(i);
    if (!isOpRelevant(Op))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // Remember which registers we define
      if (Op.isDef())
        DefinedRegs[j] = Limit;

      // and which one we are using
      if (Op.isUse())
        UsedRegs[j] = Limit;
    }
  }
}
    355 
/// \brief Insert an S_WAITCNT before \p I if the counters in \p Required
/// have not been waited on yet.
///
/// Returns true iff a wait instruction was inserted.  Updates WaitedOn and
/// resets the clause/M0 tracking state as a side effect.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {

  // End of program? No need to wait on anything
  // A function not returning void needs to wait, because other bytecode will
  // be appended after it and we don't know what it will be.
  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
    return false;

  // Figure out if the async instructions execute in order
  bool Ordered[3];

  // VM_CNT is always ordered
  Ordered[0] = true;

  // EXP_CNT is unordered if we have both EXP & VM-writes
  Ordered[1] = ExpInstrTypesSeen == 3;

  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
  Ordered[2] = false;

  // The values we are going to put into the S_WAITCNT instruction
  Counters Counts = WaitCounts;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned i = 0; i < 3; ++i) {

    // Already waited down far enough for this counter.
    if (Required.Array[i] <= WaitedOn.Array[i])
      continue;

    NeedWait = true;

    if (Ordered[i]) {
      // For an in-order counter we can wait for "everything issued after the
      // required point", i.e. the distance between LastIssued and Required.
      unsigned Value = LastIssued.Array[i] - Required.Array[i];

      // Adjust the value to the real hardware possibilities.
      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);

    } else
      Counts.Array[i] = 0;

    // Remember on what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
  }

  if (!NeedWait)
    return false;

  // Reset EXP_CNT instruction types
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Build the wait instruction
  // (immediate layout: VM in bits 3:0, EXP in bits 6:4, LGKM in bits 11:8).
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
          .addImm((Counts.Named.VM & 0xF) |
                  ((Counts.Named.EXP & 0x7) << 4) |
                  ((Counts.Named.LGKM & 0xF) << 8));

  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  return true;
}
    421 
    422 /// \brief helper function for handleOperands
    423 static void increaseCounters(Counters &Dst, const Counters &Src) {
    424 
    425   for (unsigned i = 0; i < 3; ++i)
    426     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
    427 }
    428 
    429 /// \brief check whether any of the counters is non-zero
    430 static bool countersNonZero(const Counters &Counter) {
    431   for (unsigned i = 0; i < 3; ++i)
    432     if (Counter.Array[i])
    433       return true;
    434   return false;
    435 }
    436 
    437 void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
    438   assert(I->getOpcode() == AMDGPU::S_WAITCNT);
    439 
    440   unsigned Imm = I->getOperand(0).getImm();
    441   Counters Counts, WaitOn;
    442 
    443   Counts.Named.VM = Imm & 0xF;
    444   Counts.Named.EXP = (Imm >> 4) & 0x7;
    445   Counts.Named.LGKM = (Imm >> 8) & 0xF;
    446 
    447   for (unsigned i = 0; i < 3; ++i) {
    448     if (Counts.Array[i] <= LastIssued.Array[i])
    449       WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
    450     else
    451       WaitOn.Array[i] = 0;
    452   }
    453 
    454   increaseCounters(DelayedWaitOn, WaitOn);
    455 }
    456 
/// \brief Resolve all operand dependencies of \p MI to counter requirements.
///
/// For every register used, the result covers counters recorded for
/// outstanding writes of that register; for every register defined, it also
/// covers counters recorded for outstanding reads and writes.
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {

  Counters Result = ZeroCounts;

  // For each register affected by this instruction increase the result
  // sequence.
  //
  // TODO: We could probably just look at explicit operands if we removed VCC /
  // EXEC from SMRD dest reg classes.
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // A def must wait for outstanding reads and writes of the register.
      if (Op.isDef()) {
        increaseCounters(Result, UsedRegs[j]);
        increaseCounters(Result, DefinedRegs[j]);
      }

      // A use must wait for outstanding writes of the register.
      if (Op.isUse())
        increaseCounters(Result, DefinedRegs[j]);
    }
  }

  return Result;
}
    487 
    488 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
    489                                   MachineBasicBlock::iterator I) {
    490   if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    491     return;
    492 
    493   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
    494   if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
    495     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
    496     LastInstWritesM0 = false;
    497     return;
    498   }
    499 
    500   // Set whether this instruction sets M0
    501   LastInstWritesM0 = false;
    502 
    503   unsigned NumOperands = I->getNumOperands();
    504   for (unsigned i = 0; i < NumOperands; i++) {
    505     const MachineOperand &Op = I->getOperand(i);
    506 
    507     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
    508       LastInstWritesM0 = true;
    509   }
    510 }
    511 
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
/// \brief Walk all blocks in order, inserting S_WAITCNT instructions where
/// results of asynchronous operations are consumed or overwritten, and
/// applying the SI/CI vccz corruption workaround.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  // Reset all per-function tracking state.
  WaitedOn = ZeroCounts;
  DelayedWaitOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  // Pre-existing S_WAITCNT instructions are folded into DelayedWaitOn and
  // removed at the end.
  SmallVector<MachineInstr *, 4> RemoveMI;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {

      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
        //    complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.

        if (TII->isSMRD(I->getOpcode())) {
          VCCZCorrupt = true;
        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
          // Whenever we store a value in vcc, the correct value of vccz is
          // restored.
          VCCZCorrupt = false;
        }

        // Check if we need to apply the bug work-around
        if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');

          // Wait on everything, not just LGKM.  vccz reads usually come from
          // terminators, and we always wait on everything at the end of the
          // block, so if we only wait on LGKM here, we might end up with
          // another s_waitcnt inserted right after this if there are non-LGKM
          // instructions still outstanding.
          insertWait(MBB, I, LastIssued);

          // Restore the vccz bit.  Any time a value is written to vcc, the vcc
          // bit is updated, so we can restore the bit by reading the value of
          // vcc and then writing it back to the register.
          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                  AMDGPU::VCC)
                  .addReg(AMDGPU::VCC);
        }
      }

      // Record pre-existing, explicitly requested waits
      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
        handleExistingWait(*I);
        RemoveMI.push_back(&*I);
        continue;
      }

      Counters Required;

      // Wait for everything before a barrier.
      //
      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
      // but we also want to wait for any other outstanding transfers before
      // signalling other hardware blocks
      if (I->getOpcode() == AMDGPU::S_BARRIER ||
          I->getOpcode() == AMDGPU::S_SENDMSG)
        Required = LastIssued;
      else
        Required = handleOperands(*I);

      Counters Increment = getHwCounts(*I);

      // Fold previously recorded explicit waits into the requirement once we
      // reach an instruction that needs a wait or bumps a counter.
      if (countersNonZero(Required) || countersNonZero(Increment))
        increaseCounters(Required, DelayedWaitOn);

      Changes |= insertWait(MBB, I, Required);

      pushInstruction(MBB, I, Increment);
      handleSendMsg(MBB, I);
    }

    // Wait for everything at the end of the MBB
    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  // The explicit S_WAITCNTs have been folded into our own waits; drop them.
  for (MachineInstr *I : RemoveMI)
    I->eraseFromParent();

  return Changes;
}
    619