//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
  "amdgpu-waitcnt-forcezero",
  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
  cl::init(0), cl::Hidden);
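// Illustrative usage (an assumption, not from this file): when debugging a
// suspected missing-wait bug, the flag above can be passed to llc, e.g.
//   llc -march=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-forcezero=1 test.ll
// so that every inserted s_waitcnt drains all three counters.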

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with an operand.  Used for determining whether an s_waitcnt
// instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
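// Illustrative example (not from the original source): with the constants
// above, v17 is scored at index 17, the LDS placeholder slot at
// SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 256, and s5 at NUM_ALL_VGPRS + 5 = 262.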

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple event types are present within the brackets,
// the wait count may get decremented out of order, so we need to put in an
// "s_waitcnt 0" before the use.
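// Illustrative example (not from the original source): suppose three vector
// loads have been issued, so the VM_CNT bracket is LB = 0, UB = 3, and the
// first load (score 1) wrote v0. A use of v0 only needs that first load to
// have completed, so updateByWait(VM_CNT, 1) raises the LB to 1 and the
// caller emits s_waitcnt vmcnt(2) (UB - score = 3 - 1), leaving the two
// younger loads outstanding.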
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // Waitcnt scores for every vgpr.
  // Keep track of VgprUB and SgprUB to make merges at joins efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Waitcnt scores for every sgpr; only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the
// end of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

  // ForceEmitZeroWaitcnts: force all waitcnt instructions to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a partial-write (PW) operand does not need a waitcnt;
  // a partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
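// Illustrative example (not from the original source): a 64-bit operand in
// v[4:5] has a register-class size of 64 bits, so the function returns the
// half-open interval [4, 6) -- Result.first = 4 and Result.second = 6,
// covering both 32-bit scoreboard slots.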

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
       Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
    ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_wait counter with a
    // conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // event types in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else {
      NeedWait = CNT_MASK(T);
      setScoreLB(T, ScoreToWait);
    }
  }

  return NeedWait;
}
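// Illustrative example (not from the original source): with LB = 2 and
// UB = 5, a use whose defining access has score 4 takes the final branch
// above, raising the LB to 4; the caller then emits a wait for
// UB - LB = 1 outstanding access (e.g. s_waitcnt vmcnt(1)).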

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always return out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Given two wait-count encodings, check whether LHS is stronger than RHS,
/// i.e. whether LHS waits for everything RHS waits for.
bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
    return false;
  return true;
}

/// Given two wait-count encodings, create a new encoding which is stronger
/// than or equal to both.
unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
                            AMDGPU::decodeVmcnt(IV, RHS));
  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
                              AMDGPU::decodeLgkmcnt(IV, RHS));
  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
                             AMDGPU::decodeExpcnt(IV, RHS));
  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
}
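// Illustrative example (not from the original source): combining the
// encodings of "s_waitcnt vmcnt(1)" and "s_waitcnt vmcnt(3) lgkmcnt(0)"
// takes the minimum of each field, yielding "s_waitcnt vmcnt(1) lgkmcnt(0)"
// -- a single wait at least as strong as both inputs. Fields left
// unspecified decode to their maximum, so they never tighten the result.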

///  Generate an s_waitcnt instruction to be placed before MI.
///  Instructions of a given type are returned in order,
///  but instructions of different types can complete out of order.
///  We rely on this in-order completion
///  and simply assign a score to the memory access instructions.
///  We keep track of the active "score bracket" to determine
///  if a memory access requires an s_waitcnt,
///  and if so what the value of each counter is.
///  The "score bracket" is bound by the lower bound and upper bound
///  scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  unsigned int EmitWaitcnt = 0;

  // No need to wait before a phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is no coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.

  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
  bool ForceEmitZeroWaitcnt = false;

  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitWaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitWaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitWaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: enable the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: enable the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
        ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUASI.LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If the destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the right WAR
    // order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUASI.LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER without an implicit S_WAITCNT 0
  // before it on this subtarget; if so, wait on all counters here. Doing the
  // check here prevents any additional S_WAITCNTs from being emitted if the
  // instruction was marked as requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM.  vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      // FIXME: this is too conservative / the comment is wrong.
      // We don't wait on everything at the end of the block and we combine
      // waitcnts so we should never have back-to-back waitcnts.
      ForceEmitZeroWaitcnt = true;
      EmitWaitcnt = true;
    }
  }

  // Does this operand processing indicate s_wait counter update?
  if (EmitWaitcnt || IsForceEmitWaitcnt) {
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitWaitcnt & CNT_MASK(T)) {
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitWaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      if (!OldWaitcnt ||
          (AMDGPU::decodeVmcnt(IV, Imm) !=
           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            assert(!BlockVisitedSet.count(TBB));
            BlockWaitcntBracketsMap[TBB] =
                llvm::make_unique<BlockWaitcntBrackets>(ST);
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          LLVM_DEBUG(dbgs()
                         << "set-revisit2: Block"
                         << ContainingLoop->getHeader()->getNumber() << '\n';);
        }
      }

      // Update an existing waitcnt, or make a new one.
   1221       unsigned Enc = AMDGPU::encodeWaitcnt(IV,
   1222                       ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
   1223                       ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
   1224                       ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
   1225       // We don't remove waitcnts that existed prior to the waitcnt
   1226       // pass. Check if the waitcnt to-be-inserted can be avoided
   1227       // or if the prev waitcnt can be updated.
      bool insertSWaitInst = true;
      for (MachineBasicBlock::iterator I = MI.getIterator(),
                                       B = MI.getParent()->begin();
           insertSWaitInst && I != B; --I) {
        if (I == MI.getIterator())
          continue;

        switch (I->getOpcode()) {
        case AMDGPU::S_WAITCNT:
          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
            insertSWaitInst = false;
          else if (!OldWaitcnt) {
            OldWaitcnt = &*I;
            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
          }
          break;
        // TODO: skip over instructions which never require wait.
        }
        break;
      }
      if (insertSWaitInst) {
        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
          if (ForceEmitZeroWaitcnts)
            LLVM_DEBUG(
                dbgs()
                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
          if (IsForceEmitWaitcnt)
            LLVM_DEBUG(dbgs()
                       << "Force emit a s_waitcnt due to debug counter\n");

          OldWaitcnt->getOperand(0).setImm(Enc);
          if (!OldWaitcnt->getParent())
            MI.getParent()->insert(MI, OldWaitcnt);

          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *OldWaitcnt << '\n');
        } else {
          auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                                   MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                               .addImm(Enc);
          TrackedWaitcntSet.insert(SWaitInst);

          LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *SWaitInst << '\n');
        }
      }

      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }
}

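// A minimal sketch (not part of the pass) of the ordering assumed by the
// isWaitcntStronger/combineWaitcnt calls above: s_waitcnt waits until each
// counter drains to at most its encoded value, so smaller fields wait longer,
// and one encoding subsumes another exactly when every decoded field is <=.
// Field positions are ISA-dependent; the 4-bit fields below are hypothetical
// and for illustration only.
static inline bool isWaitcntStrongerSketch(unsigned A, unsigned B) {
  auto Field = [](unsigned Enc, unsigned Shift) { return (Enc >> Shift) & 0xf; };
  return Field(A, 0) <= Field(B, 0) && // vmcnt
         Field(A, 4) <= Field(B, 4) && // expcnt
         Field(A, 8) <= Field(B, 8);   // lgkmcnt
}
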
void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it has memory tokens for
// LDS or flat memory; if there are no memory operands at all, conservatively
// assume the operation may access LDS.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }

  return false;
}
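
// A minimal sketch (not part of the pass) of the conservative rule above:
// no memory operands means we must assume the access can reach LDS, and
// otherwise any operand in the local or flat address space implies it may.
static inline bool mayAccessLDSSketch(const SmallVectorImpl<unsigned> &ASes,
                                      unsigned LocalAS, unsigned FlatAS) {
  if (ASes.empty())
    return true; // No info: conservatively assume LDS.
  return llvm::any_of(ASes, [&](unsigned AS) {
    return AS == LocalAS || AS == FlatAS;
  });
}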

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}

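// Illustrative helper (not used by the pass): the EXP target ranges tested
// above, written as a classifier. Targets 32..63 are parameter exports and
// 12..15 are position exports; everything else (e.g. color/MRT targets)
// takes the generic EXP_GPR_LOCK path.
enum class ExpTargetKindSketch { Param, Pos, Other };
static inline ExpTargetKindSketch classifyExpTargetSketch(int Tgt) {
  if (Tgt >= 32 && Tgt <= 63)
    return ExpTargetKindSketch::Param;
  if (Tgt >= 12 && Tgt <= 15)
    return ExpTargetKindSketch::Pos;
  return ExpTargetKindSketch::Other;
}
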
// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block.
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block)) !=
               Block.pred_end();
  if (IsSelfPred) {
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();

  // Merge the maximum pending-event spans over all predecessors that have
  // already been visited; unvisited predecessors, and predecessors that wait
  // at their beginning, contribute no pending events.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int Span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], Span);
      Span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], Span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  // Also handle kills for exit block.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int Span = KillWaitBrackets[I]->getScoreUB(T) -
                   KillWaitBrackets[I]->getScoreLB(T);
        MaxPending[T] = std::max(MaxPending[T], Span);
        Span = KillWaitBrackets[I]->pendingFlat(T) -
               KillWaitBrackets[I]->getScoreLB(T);
        MaxFlat[T] = std::max(MaxFlat[T], Span);
      }

      MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
    }
  }

  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
      int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
    }
  }

#if 0
  // LC (unlike SC) does not add a waitcnt at the beginning of a block.
  // Leaving this as a marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
#endif

  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);

  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information.
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        int PredScale = MaxPending[T] - PredUB;
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    // Also merge the WaitEvent information.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }

  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
  // Set the register scoreboard.
  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
      // Now merge the gpr_reg_score information.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        int PredLB = KillWaitBrackets[I]->getScoreLB(T);
        int PredUB = KillWaitBrackets[I]->getScoreUB(T);
        if (PredLB < PredUB) {
          int PredScale = MaxPending[T] - PredUB;
          // Merge vgpr scores.
          for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
            int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
          }
          // Also need to merge sgpr scores for lgkm_cnt.
          if (T == LGKM_CNT) {
            for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
              int PredRegScore =
                  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
              if (PredRegScore <= PredLB)
                continue;
              int NewRegScore = PredScale + PredRegScore;
              ScoreBrackets->setRegScore(
                  J + NUM_ALL_VGPRS, LGKM_CNT,
                  std::max(
                      ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                      NewRegScore));
            }
          }
        }
      }

      // Also merge the WaitEvent information.
      ForAllWaitEventType(W) {
        enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
        int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
        if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
          int NewEventUB =
              MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
          if (NewEventUB > 0) {
            ScoreBrackets->setEventUB(
                W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
          }
        }
      }
    }
  }

  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int PredGDSUB = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (PredGDSUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int NewGDSUB = MaxPending[EXP_CNT] + PredGDSUB -
                     PredScoreBrackets->getScoreUB(EXP_CNT);
      if (NewGDSUB > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), NewGDSUB));
      }
    }
    int PredExpUB = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (PredExpUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int NewExpUB = MaxPending[EXP_CNT] + PredExpUB -
                     PredScoreBrackets->getScoreUB(EXP_CNT);
      if (NewExpUB > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), NewExpUB));
      }
    }
  }

  // If this is a single-block loop, update the score brackets. Not needed
  // for other blocks, as we did this in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}

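// A minimal sketch (not part of the pass) of the rescaling performed when
// merging predecessor scores above: a predecessor score S in (PredLB, PredUB]
// is shifted so that PredUB lines up with the merged upper bound MaxPending.
// E.g. with PredUB = 5 and MaxPending = 8, a score of 3 becomes 6.
static inline int rescalePredScoreSketch(int S, int PredUB, int MaxPending) {
  return (MaxPending - PredUB) + S; // same as PredScale + PredRegScore above
}
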
/// Return true if the given basic block is a "bottom" block of a loop.
/// This works even if the loop is discontiguous. This also handles
/// multiple back-edges for the same "header" block of a loop.
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                    const MachineBasicBlock *Block) {
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
      return true;
    }
  }
  return false;
}

/// Count the number of "bottom" basic blocks of a loop.
unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
  unsigned Count = 0;
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB->isSuccessor(Loop->getHeader())) {
      Count++;
    }
  }
  return Count;
}

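// A minimal sketch (not part of the pass) of how the bottom-block count is
// used by insertWaitcntInBlock below: a loop with n "bottom" blocks is given
// at most n + 1 iterations to converge naturally before a zero waitcnt is
// forced at the loop footer.
static inline bool shouldForceConvergenceSketch(unsigned IterCnt,
                                                unsigned NumBottomBlocks) {
  return IterCnt > NumBottomBlocks + 1;
}
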
// Generate s_waitcnt instructions where needed.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });

  // Walk over the instructions.
  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;
    // Handle previously existing waitcnts: leave pre-existing (non-pass)
    // waitcnts in place, but note their existence via setWaitcnt; remove
    // pass-generated waitcnts, which will be added back as needed.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      if (!TrackedWaitcntSet.count(&Inst))
        ++Iter;
      else {
        ++Iter;
        Inst.removeFromParent();
      }
      ScoreBrackets->setWaitcnt(&Inst);
      continue;
    }

    // Kill instructions generate a conditional branch to the endmain block.
    // Merge the current waitcnt state into the endmain block information.
    // TODO: Are there other flavors of KILL instruction?
    if (Inst.getOpcode() == AMDGPU::KILL) {
      addKillWaitBracket(ScoreBrackets);
    }

    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets);

    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    ScoreBrackets->clearWaitcnt();

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
      ScoreBrackets->updateByWait(LGKM_CNT,
                                  ScoreBrackets->getScoreUB(LGKM_CNT));
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each
    // loop should take at most (n+1) iterations for it to converge naturally,
    // where n is the number of bottom blocks. If this threshold is reached and
    // the result hasn't converged, then we force convergence by inserting
    // an s_waitcnt at the end of the loop footer.
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, need to make wait events at loop footer be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

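// A minimal sketch (not part of the pass) of the scoreboard effect of a full
// wait, as used for the GWS sequence and the forced loop-footer wait above:
// draining a counter to zero is modeled by raising the bracket's lower bound
// to its upper bound, after which nothing in (LB, UB] remains outstanding.
static inline void applyFullWaitSketch(int32_t &ScoreLB, int32_t ScoreUB) {
  ScoreLB = ScoreUB;
}
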
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  AMDGPUASI = ST->getAMDGPUAS();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1))
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, so reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after
    // the costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}