Home | History | Annotate | Download | only in AMDGPU
      1 //===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief SI Machine Scheduler interface
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "AMDGPU.h"
     16 #include "SIMachineScheduler.h"
     17 #include "llvm/CodeGen/LiveInterval.h"
     18 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
     19 #include "llvm/CodeGen/MachineRegisterInfo.h"
     20 #include "llvm/CodeGen/MachineScheduler.h"
     21 #include "llvm/CodeGen/RegisterPressure.h"
     22 
     23 using namespace llvm;
     24 
     25 #define DEBUG_TYPE "misched"
     26 
     27 // This scheduler implements a different scheduling algorithm than
     28 // GenericScheduler.
     29 //
     30 // There are several specific architecture behaviours that can't be modelled
     31 // for GenericScheduler:
     32 // . When accessing the result of an SGPR load instruction, you have to wait
     33 // for all the SGPR load instructions before your current instruction to
     34 // have finished.
     35 // . When accessing the result of an VGPR load instruction, you have to wait
     36 // for all the VGPR load instructions previous to the VGPR load instruction
     37 // you are interested in to finish.
      38 // . The lower the register pressure, the better load latencies are hidden
     39 //
      40 // Moreover some specificities (like the fact a lot of instructions in the shader
      41 // have few dependencies) make the generic scheduler have some unpredictable
     42 // behaviours. For example when register pressure becomes high, it can either
     43 // manage to prevent register pressure from going too high, or it can
     44 // increase register pressure even more than if it hadn't taken register
     45 // pressure into account.
     46 //
     47 // Also some other bad behaviours are generated, like loading at the beginning
     48 // of the shader a constant in VGPR you won't need until the end of the shader.
     49 //
     50 // The scheduling problem for SI can distinguish three main parts:
     51 // . Hiding high latencies (texture sampling, etc)
     52 // . Hiding low latencies (SGPR constant loading, etc)
     53 // . Keeping register usage low for better latency hiding and general
     54 //   performance
     55 //
     56 // Some other things can also affect performance, but are hard to predict
     57 // (cache usage, the fact the HW can issue several instructions from different
     58 // wavefronts if different types, etc)
     59 //
     60 // This scheduler tries to solve the scheduling problem by dividing it into
     61 // simpler sub-problems. It divides the instructions into blocks, schedules
     62 // locally inside the blocks where it takes care of low latencies, and then
     63 // chooses the order of the blocks by taking care of high latencies.
     64 // Dividing the instructions into blocks helps control keeping register
     65 // usage low.
     66 //
     67 // First the instructions are put into blocks.
     68 //   We want the blocks help control register usage and hide high latencies
     69 //   later. To help control register usage, we typically want all local
      70 //   computations, when for example you create a result that can be consumed
     71 //   right away, to be contained in a block. Block inputs and outputs would
     72 //   typically be important results that are needed in several locations of
     73 //   the shader. Since we do want blocks to help hide high latencies, we want
     74 //   the instructions inside the block to have a minimal set of dependencies
     75 //   on high latencies. It will make it easy to pick blocks to hide specific
     76 //   high latencies.
     77 //   The block creation algorithm is divided into several steps, and several
     78 //   variants can be tried during the scheduling process.
     79 //
      80 // Second the order of the instructions inside the blocks is chosen.
     81 //   At that step we do take into account only register usage and hiding
     82 //   low latency instructions
     83 //
      84 // Third the block order is chosen, where we try to hide high latencies
     85 // and keep register usage low.
     86 //
     87 // After the third step, a pass is done to improve the hiding of low
     88 // latencies.
     89 //
     90 // Actually when talking about 'low latency' or 'high latency' it includes
      91 // both the latency to get the data from the cache (or global mem) to the
      92 // register, and the bandwidth limitations.
     93 // Increasing the number of active wavefronts helps hide the former, but it
     94 // doesn't solve the latter, thus why even if wavefront count is high, we have
     95 // to try have as many instructions hiding high latencies as possible.
     96 // The OpenCL doc says for example latency of 400 cycles for a global mem access,
     97 // which is hidden by 10 instructions if the wavefront count is 10.
     98 
     99 // Some figures taken from AMD docs:
    100 // Both texture and constant L1 caches are 4-way associative with 64 bytes
    101 // lines.
    102 // Constant cache is shared with 4 CUs.
    103 // For texture sampling, the address generation unit receives 4 texture
    104 // addresses per cycle, thus we could expect texture sampling latency to be
    105 // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
    106 // instructions in a wavefront group are executed every 4 cycles),
    107 // or 16 instructions if the other wavefronts associated to the 3 other VALUs
    108 // of the CU do texture sampling too. (Don't take these figures too seriously,
    109 // as I'm not 100% sure of the computation)
    110 // Data exports should get similar latency.
     111 // For constant loading, the cache is shared with 4 CUs.
    112 // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
    113 // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
    114 // It means a simple s_buffer_load should take one instruction to hide, as
    115 // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
    116 // cache line.
    117 //
    118 // As of today the driver doesn't preload the constants in cache, thus the
    119 // first loads get extra latency. The doc says global memory access can be
    120 // 300-600 cycles. We do not specially take that into account when scheduling
    121 // As we expect the driver to be able to preload the constants soon.
    122 
    123 
    124 // common code //
    125 
    126 #ifndef NDEBUG
    127 
    128 static const char *getReasonStr(SIScheduleCandReason Reason) {
    129   switch (Reason) {
    130   case NoCand:         return "NOCAND";
    131   case RegUsage:       return "REGUSAGE";
    132   case Latency:        return "LATENCY";
    133   case Successor:      return "SUCCESSOR";
    134   case Depth:          return "DEPTH";
    135   case NodeOrder:      return "ORDER";
    136   }
    137   llvm_unreachable("Unknown reason!");
    138 }
    139 
    140 #endif
    141 
    142 static bool tryLess(int TryVal, int CandVal,
    143                     SISchedulerCandidate &TryCand,
    144                     SISchedulerCandidate &Cand,
    145                     SIScheduleCandReason Reason) {
    146   if (TryVal < CandVal) {
    147     TryCand.Reason = Reason;
    148     return true;
    149   }
    150   if (TryVal > CandVal) {
    151     if (Cand.Reason > Reason)
    152       Cand.Reason = Reason;
    153     return true;
    154   }
    155   Cand.setRepeat(Reason);
    156   return false;
    157 }
    158 
    159 static bool tryGreater(int TryVal, int CandVal,
    160                        SISchedulerCandidate &TryCand,
    161                        SISchedulerCandidate &Cand,
    162                        SIScheduleCandReason Reason) {
    163   if (TryVal > CandVal) {
    164     TryCand.Reason = Reason;
    165     return true;
    166   }
    167   if (TryVal < CandVal) {
    168     if (Cand.Reason > Reason)
    169       Cand.Reason = Reason;
    170     return true;
    171   }
    172   Cand.setRepeat(Reason);
    173   return false;
    174 }
    175 
    176 // SIScheduleBlock //
    177 
    178 void SIScheduleBlock::addUnit(SUnit *SU) {
    179   NodeNum2Index[SU->NodeNum] = SUnits.size();
    180   SUnits.push_back(SU);
    181 }
    182 
    183 #ifndef NDEBUG
    184 
    185 void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
    186 
    187   dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
    188   dbgs() << '\n';
    189 }
    190 #endif
    191 
// Compare TryCand against the current best Cand for top-down in-block
// scheduling. Criteria are tried in strict priority order; the first
// decisive tryLess/tryGreater wins and sets TryCand.Reason. If no criterion
// prefers TryCand, it can still win on original instruction order.
void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  // Highest priority: relieve SGPR pressure once it is already high.
  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If in the block there is a lot of constant loads, the SGPR usage
  // could go quite high, thus above the arbitrary limit of 60 will encourage
  // use the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Among low latency instructions, prefer the smaller load offset
  // (keeps loads on the same/nearby cache lines together).
  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}
    241 
    242 SUnit* SIScheduleBlock::pickNode() {
    243   SISchedCandidate TopCand;
    244 
    245   for (SUnit* SU : TopReadySUs) {
    246     SISchedCandidate TryCand;
    247     std::vector<unsigned> pressure;
    248     std::vector<unsigned> MaxPressure;
    249     // Predict register usage after this instruction.
    250     TryCand.SU = SU;
    251     TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    252     TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    253     TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    254     TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    255     TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    256     TryCand.HasLowLatencyNonWaitedParent =
    257       HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    258     tryCandidateTopDown(TopCand, TryCand);
    259     if (TryCand.Reason != NoCand)
    260       TopCand.setBest(TryCand);
    261   }
    262 
    263   return TopCand.SU;
    264 }
    265 
    266 
    267 // Schedule something valid.
    268 void SIScheduleBlock::fastSchedule() {
    269   TopReadySUs.clear();
    270   if (Scheduled)
    271     undoSchedule();
    272 
    273   for (SUnit* SU : SUnits) {
    274     if (!SU->NumPredsLeft)
    275       TopReadySUs.push_back(SU);
    276   }
    277 
    278   while (!TopReadySUs.empty()) {
    279     SUnit *SU = TopReadySUs[0];
    280     ScheduledSUnits.push_back(SU);
    281     nodeScheduled(SU);
    282   }
    283 
    284   Scheduled = true;
    285 }
    286 
    287 // Returns if the register was set between first and last.
    288 static bool isDefBetween(unsigned Reg,
    289                            SlotIndex First, SlotIndex Last,
    290                            const MachineRegisterInfo *MRI,
    291                            const LiveIntervals *LIS) {
    292   for (MachineRegisterInfo::def_instr_iterator
    293        UI = MRI->def_instr_begin(Reg),
    294        UE = MRI->def_instr_end(); UI != UE; ++UI) {
    295     const MachineInstr* MI = &*UI;
    296     if (MI->isDebugValue())
    297       continue;
    298     SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    299     if (InstSlot >= First && InstSlot <= Last)
    300       return true;
    301   }
    302   return false;
    303 }
    304 
// Compute the block's live-in/live-out register sets and pressures by running
// a forward RPTracker pass over the current (fast-)scheduled order, and
// prepare TopRPTracker for the real top-down scheduling pass.
// Requires a valid schedule (ScheduledSUnits populated).
void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  // NOTE(review): BotPressure backs the local BotRPTracker here, while
  // TopRPTracker is a member — presumably backed by the TopPressure member
  // read at the bottom of this function; confirm against the class decl.
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situation 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to avoid mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of isDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}
    377 
// Schedule the block for real: first compute live-ins/outs from a quick valid
// schedule, then undo it and pick nodes one by one with pickNode() while
// tracking register pressure top-down.
void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  // initRegPressure requires a valid schedule of the block; produce a cheap
  // one if we do not have one yet.
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  // pickNode chooses among the ready nodes; nodeScheduled maintains the ready
  // list and the low-latency bookkeeping, and TopRPTracker follows along to
  // provide pressure predictions for the next pick.
  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right: every unit scheduled exactly once, and no
  // dangling predecessor counts.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
            TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
              SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}
    419 
    420 void SIScheduleBlock::undoSchedule() {
    421   for (SUnit* SU : SUnits) {
    422     SU->isScheduled = false;
    423     for (SDep& Succ : SU->Succs) {
    424       if (BC->isSUInBlock(Succ.getSUnit(), ID))
    425         undoReleaseSucc(SU, &Succ);
    426     }
    427   }
    428   HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
    429   ScheduledSUnits.clear();
    430   Scheduled = false;
    431 }
    432 
    433 void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
    434   SUnit *SuccSU = SuccEdge->getSUnit();
    435 
    436   if (SuccEdge->isWeak()) {
    437     ++SuccSU->WeakPredsLeft;
    438     return;
    439   }
    440   ++SuccSU->NumPredsLeft;
    441 }
    442 
    443 void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
    444   SUnit *SuccSU = SuccEdge->getSUnit();
    445 
    446   if (SuccEdge->isWeak()) {
    447     --SuccSU->WeakPredsLeft;
    448     return;
    449   }
    450 #ifndef NDEBUG
    451   if (SuccSU->NumPredsLeft == 0) {
    452     dbgs() << "*** Scheduling failed! ***\n";
    453     SuccSU->dump(DAG);
    454     dbgs() << " has been released too many times!\n";
    455     llvm_unreachable(nullptr);
    456   }
    457 #endif
    458 
    459   --SuccSU->NumPredsLeft;
    460 }
    461 
    462 /// Release Successors of the SU that are in the block or not.
    463 void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
    464   for (SDep& Succ : SU->Succs) {
    465     SUnit *SuccSU = Succ.getSUnit();
    466 
    467     if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
    468       continue;
    469 
    470     releaseSucc(SU, &Succ);
    471     if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
    472       TopReadySUs.push_back(SuccSU);
    473   }
    474 }
    475 
// Commit SU as scheduled: remove it from the ready list, release its
// in-block successors, and maintain the HasLowLatencyNonWaitedParent flags
// consumed by tryCandidateTopDown. The reset-then-set order below matters.
void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit*>::iterator I =
    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  // If SU is itself a low latency instruction, flag its in-block successors:
  // scheduling one of them next would have to wait on SU's result.
  if (DAG->IsLowLatencySU[SU->NodeNum]) {
     for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}
    503 
    504 void SIScheduleBlock::finalizeUnits() {
    505   // We remove links from outside blocks to enable scheduling inside the block.
    506   for (SUnit* SU : SUnits) {
    507     releaseSuccessors(SU, false);
    508     if (DAG->IsHighLatencySU[SU->NodeNum])
    509       HighLatencyBlock = true;
    510   }
    511   HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
    512 }
    513 
    514 // we maintain ascending order of IDs
    515 void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
    516   unsigned PredID = Pred->getID();
    517 
    518   // Check if not already predecessor.
    519   for (SIScheduleBlock* P : Preds) {
    520     if (PredID == P->getID())
    521       return;
    522   }
    523   Preds.push_back(Pred);
    524 
    525   assert(none_of(Succs,
    526                  [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
    527          "Loop in the Block Graph!");
    528 }
    529 
    530 void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
    531   unsigned SuccID = Succ->getID();
    532 
    533   // Check if not already predecessor.
    534   for (SIScheduleBlock* S : Succs) {
    535     if (SuccID == S->getID())
    536       return;
    537   }
    538   if (Succ->isHighLatencyBlock())
    539     ++NumHighLatencySuccessors;
    540   Succs.push_back(Succ);
    541   assert(none_of(Preds,
    542                  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
    543          "Loop in the Block Graph!");
    544 }
    545 
    546 #ifndef NDEBUG
    547 void SIScheduleBlock::printDebug(bool full) {
    548   dbgs() << "Block (" << ID << ")\n";
    549   if (!full)
    550     return;
    551 
    552   dbgs() << "\nContains High Latency Instruction: "
    553          << HighLatencyBlock << '\n';
    554   dbgs() << "\nDepends On:\n";
    555   for (SIScheduleBlock* P : Preds) {
    556     P->printDebug(false);
    557   }
    558 
    559   dbgs() << "\nSuccessors:\n";
    560   for (SIScheduleBlock* S : Succs) {
    561     S->printDebug(false);
    562   }
    563 
    564   if (Scheduled) {
    565     dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
    566            << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    567     dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
    568            << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    569     dbgs() << "LiveIns:\n";
    570     for (unsigned Reg : LiveInRegs)
    571       dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    572 
    573     dbgs() << "\nLiveOuts:\n";
    574     for (unsigned Reg : LiveOutRegs)
    575       dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    576   }
    577 
    578   dbgs() << "\nInstructions:\n";
    579   if (!Scheduled) {
    580     for (SUnit* SU : SUnits) {
    581       SU->dump(DAG);
    582     }
    583   } else {
    584     for (SUnit* SU : SUnits) {
    585       SU->dump(DAG);
    586     }
    587   }
    588 
    589    dbgs() << "///////////////////////\n";
    590 }
    591 
    592 #endif
    593 
    594 // SIScheduleBlockCreator //
    595 
    596 SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
    597 DAG(DAG) {
    598 }
    599 
    600 SIScheduleBlockCreator::~SIScheduleBlockCreator() {
    601 }
    602 
    603 SIScheduleBlocks
    604 SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
    605   std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    606     Blocks.find(BlockVariant);
    607   if (B == Blocks.end()) {
    608     SIScheduleBlocks Res;
    609     createBlocksForVariant(BlockVariant);
    610     topologicalSort();
    611     scheduleInsideBlocks();
    612     fillStats();
    613     Res.Blocks = CurrentBlocks;
    614     Res.TopDownIndex2Block = TopDownIndex2Block;
    615     Res.TopDownBlock2Index = TopDownBlock2Index;
    616     Blocks[BlockVariant] = Res;
    617     return Res;
    618   } else {
    619     return B->second;
    620   }
    621 }
    622 
    623 bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
    624   if (SU->NodeNum >= DAG->SUnits.size())
    625     return false;
    626   return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
    627 }
    628 
    629 void SIScheduleBlockCreator::colorHighLatenciesAlone() {
    630   unsigned DAGSize = DAG->SUnits.size();
    631 
    632   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    633     SUnit *SU = &DAG->SUnits[i];
    634     if (DAG->IsHighLatencySU[SU->NodeNum]) {
    635       CurrentColoring[SU->NodeNum] = NextReservedID++;
    636     }
    637   }
    638 }
    639 
    640 void SIScheduleBlockCreator::colorHighLatenciesGroups() {
    641   unsigned DAGSize = DAG->SUnits.size();
    642   unsigned NumHighLatencies = 0;
    643   unsigned GroupSize;
    644   unsigned Color = NextReservedID;
    645   unsigned Count = 0;
    646   std::set<unsigned> FormingGroup;
    647 
    648   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    649     SUnit *SU = &DAG->SUnits[i];
    650     if (DAG->IsHighLatencySU[SU->NodeNum])
    651       ++NumHighLatencies;
    652   }
    653 
    654   if (NumHighLatencies == 0)
    655     return;
    656 
    657   if (NumHighLatencies <= 6)
    658     GroupSize = 2;
    659   else if (NumHighLatencies <= 12)
    660     GroupSize = 3;
    661   else
    662     GroupSize = 4;
    663 
    664   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    665     SUnit *SU = &DAG->SUnits[i];
    666     if (DAG->IsHighLatencySU[SU->NodeNum]) {
    667       unsigned CompatibleGroup = true;
    668       unsigned ProposedColor = Color;
    669       for (unsigned j : FormingGroup) {
    670         // TODO: Currently CompatibleGroup will always be false,
    671         // because the graph enforces the load order. This
    672         // can be fixed, but as keeping the load order is often
    673         // good for performance that causes a performance hit (both
    674         // the default scheduler and this scheduler).
    675         // When this scheduler determines a good load order,
    676         // this can be fixed.
    677         if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
    678             !DAG->canAddEdge(&DAG->SUnits[j], SU))
    679           CompatibleGroup = false;
    680       }
    681       if (!CompatibleGroup || ++Count == GroupSize) {
    682         FormingGroup.clear();
    683         Color = ++NextReservedID;
    684         if (!CompatibleGroup) {
    685           ProposedColor = Color;
    686           FormingGroup.insert(SU->NodeNum);
    687         }
    688         Count = 0;
    689       } else {
    690         FormingGroup.insert(SU->NodeNum);
    691       }
    692       CurrentColoring[SU->NodeNum] = ProposedColor;
    693     }
    694   }
    695 }
    696 
// Compute "reserved dependency" colorings: two SUs share a top-down color
// iff they transitively depend on the same set of reserved (high latency)
// colors, and share a bottom-up color iff the same holds for their
// successors. Color 0 means "no reserved dependency".
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a set of inherited colors to the fresh color assigned to it.
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given a reserved color: keep it.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved-dependency colors of all strong predecessors.
   for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as the parents.
    // NOTE(review): the "> DAGSize" test presumably distinguishes reserved
    // color IDs from combination IDs — confirm against how NextReservedID
    // and NextNonReservedID are initialized.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      // New combination of colors: allocate (or reuse) a color for it.
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given a reserved color: keep it.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved-dependency colors of all strong successors.
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as the children (successors).
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
    790 
    791 void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
    792   unsigned DAGSize = DAG->SUnits.size();
    793   std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
    794 
    795   // Every combination of colors given by the top down
    796   // and bottom up Reserved node dependency
    797 
    798   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    799     SUnit *SU = &DAG->SUnits[i];
    800     std::pair<unsigned, unsigned> SUColors;
    801 
    802     // High latency instructions: already given.
    803     if (CurrentColoring[SU->NodeNum])
    804       continue;
    805 
    806     SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    807     SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
    808 
    809     std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
    810       ColorCombinations.find(SUColors);
    811     if (Pos != ColorCombinations.end()) {
    812       CurrentColoring[SU->NodeNum] = Pos->second;
    813     } else {
    814       CurrentColoring[SU->NodeNum] = NextNonReservedID;
    815       ColorCombinations[SUColors] = NextNonReservedID++;
    816     }
    817   }
    818 }
    819 
// For SUs that carry no reserved-dependency color themselves, adopt
// the color of their successors when those are unanimous; otherwise
// allocate a fresh color.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Work on a copy so that the colors read for successors during the
  // bottom-up walk below are the values from before this pass.
  std::vector<int> PendingColoring = CurrentColoring;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Reserved colors (<= DAGSize) are left untouched.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Skip SUs that have a reserved-dependency color of their own.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Only successors with a reserved-dependency color contribute to
      // SUColors; every strong in-region successor contributes to
      // SUColorsPending (read from the pre-pass snapshot).
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // Adopt the successor color only when both views are unanimous.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}
    853 
    854 
// Force every non-reserved color to occupy a consecutive run of node
// numbers: when a color reappears after having been interrupted by
// other colors, the SU is either given a brand new color or re-attached
// to the (possibly renumbered) color of the immediately preceding SU.
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    // Keep the color of SU i-1 before PreviousColor is overwritten.
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    // A color is marked "seen" once the run using it has ended.
    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors (<= DAGSize) are left untouched.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // First run of this color: nothing to fix.
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    // The color already appeared earlier and the run was interrupted:
    // either start a new color, or continue the previous SU's run.
    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
    887 
    888 void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
    889   unsigned DAGSize = DAG->SUnits.size();
    890 
    891   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    892     SUnit *SU = &DAG->SUnits[SUNum];
    893     std::set<unsigned> SUColors;
    894 
    895     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    896       continue;
    897 
    898     // No predecessor: Vgpr constant loading.
    899     // Low latency instructions usually have a predecessor (the address)
    900     if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
    901       continue;
    902 
    903     for (SDep& SuccDep : SU->Succs) {
    904       SUnit *Succ = SuccDep.getSUnit();
    905       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    906         continue;
    907       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    908     }
    909     if (SUColors.size() == 1)
    910       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    911   }
    912 }
    913 
    914 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
    915   unsigned DAGSize = DAG->SUnits.size();
    916 
    917   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    918     SUnit *SU = &DAG->SUnits[SUNum];
    919     std::set<unsigned> SUColors;
    920 
    921     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    922       continue;
    923 
    924     for (SDep& SuccDep : SU->Succs) {
    925        SUnit *Succ = SuccDep.getSUnit();
    926       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    927         continue;
    928       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    929     }
    930     if (SUColors.size() == 1)
    931       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    932   }
    933 }
    934 
    935 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
    936   unsigned DAGSize = DAG->SUnits.size();
    937 
    938   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    939     SUnit *SU = &DAG->SUnits[SUNum];
    940     std::set<unsigned> SUColors;
    941 
    942     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    943       continue;
    944 
    945     for (SDep& SuccDep : SU->Succs) {
    946        SUnit *Succ = SuccDep.getSUnit();
    947       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    948         continue;
    949       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    950     }
    951     if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
    952       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    953   }
    954 }
    955 
    956 void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
    957   unsigned DAGSize = DAG->SUnits.size();
    958   std::map<unsigned, unsigned> ColorCount;
    959 
    960   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    961     SUnit *SU = &DAG->SUnits[SUNum];
    962     unsigned color = CurrentColoring[SU->NodeNum];
    963     std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
    964       if (Pos != ColorCount.end()) {
    965         ++ColorCount[color];
    966       } else {
    967         ColorCount[color] = 1;
    968       }
    969   }
    970 
    971   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    972     SUnit *SU = &DAG->SUnits[SUNum];
    973     unsigned color = CurrentColoring[SU->NodeNum];
    974     std::set<unsigned> SUColors;
    975 
    976     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    977       continue;
    978 
    979     if (ColorCount[color] > 1)
    980       continue;
    981 
    982     for (SDep& SuccDep : SU->Succs) {
    983        SUnit *Succ = SuccDep.getSUnit();
    984       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    985         continue;
    986       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    987     }
    988     if (SUColors.size() == 1 && *SUColors.begin() != color) {
    989       --ColorCount[color];
    990       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    991       ++ColorCount[*SUColors.begin()];
    992     }
    993   }
    994 }
    995 
// Intended to split blocks that ended up too large; currently a no-op.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
    999 
   1000 void SIScheduleBlockCreator::regroupNoUserInstructions() {
   1001   unsigned DAGSize = DAG->SUnits.size();
   1002   int GroupID = NextNonReservedID++;
   1003 
   1004   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
   1005     SUnit *SU = &DAG->SUnits[SUNum];
   1006     bool hasSuccessor = false;
   1007 
   1008     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
   1009       continue;
   1010 
   1011     for (SDep& SuccDep : SU->Succs) {
   1012        SUnit *Succ = SuccDep.getSUnit();
   1013       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
   1014         continue;
   1015       hasSuccessor = true;
   1016     }
   1017     if (!hasSuccessor)
   1018       CurrentColoring[SU->NodeNum] = GroupID;
   1019   }
   1020 }
   1021 
// Partition the DAG into SIScheduleBlocks: color every SU with the
// heuristic passes selected by BlockVariant, create one block per final
// color, and connect blocks along the SU dependencies crossing them.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID; // Color -> index into CurrentBlocks.

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Reserved colors are 1..DAGSize, non-reserved start after them.
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    // First SU of this color: create the corresponding block.
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(
        make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
     for (SDep& SuccDep : SU->Succs) {
       SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Only cross-block dependencies become block edges.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}
   1101 
   1102 // Two functions taken from Codegen/MachineScheduler.cpp
   1103 
   1104 /// If this iterator is a debug value, increment until reaching the End or a
   1105 /// non-debug instruction.
   1106 static MachineBasicBlock::const_iterator
   1107 nextIfDebug(MachineBasicBlock::const_iterator I,
   1108             MachineBasicBlock::const_iterator End) {
   1109   for(; I != End; ++I) {
   1110     if (!I->isDebugValue())
   1111       break;
   1112   }
   1113   return I;
   1114 }
   1115 
   1116 /// Non-const version.
   1117 static MachineBasicBlock::iterator
   1118 nextIfDebug(MachineBasicBlock::iterator I,
   1119             MachineBasicBlock::const_iterator End) {
   1120   // Cast the return value to nonconst MachineInstr, then cast to an
   1121   // instr_iterator, which does not check for null, finally return a
   1122   // bundle_iterator.
   1123   return MachineBasicBlock::instr_iterator(
   1124     const_cast<MachineInstr*>(
   1125       &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
   1126 }
   1127 
// Topologically order the blocks (Kahn-style, processed from the sinks
// up) and fill TopDownIndex2Block / TopDownBlock2Index /
// BottomUpIndex2Block.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds the number of unprocessed
  // successors of each block; sinks (degree 0) seed the worklist.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Assign indices from DAGSize-1 downward, so blocks closer to the
  // sinks get higher top-down indices.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      // A predecessor becomes ready once all its successors are placed.
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
      "Wrong Top Down topological sorting");
    }
  }
#endif

  // The bottom-up order is simply the reverse of the top-down order.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
   1175 
// Schedule the SUs inside every block: first a fast schedule so each
// block maps to a contiguous instruction range, then the real
// per-block schedule with accurate liveness, and finally restore the
// original instruction order in the MachineBasicBlock.
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  // Old and new position of every moved instruction, recorded so the
  // motion can be undone after the per-block scheduling below.
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance past it (skipping debug values).
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove everytime
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}
   1258 
   1259 void SIScheduleBlockCreator::fillStats() {
   1260   unsigned DAGSize = CurrentBlocks.size();
   1261 
   1262   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1263     int BlockIndice = TopDownIndex2Block[i];
   1264     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
   1265     if (Block->getPreds().size() == 0)
   1266       Block->Depth = 0;
   1267     else {
   1268       unsigned Depth = 0;
   1269       for (SIScheduleBlock *Pred : Block->getPreds()) {
   1270         if (Depth < Pred->Depth + 1)
   1271           Depth = Pred->Depth + 1;
   1272       }
   1273       Block->Depth = Depth;
   1274     }
   1275   }
   1276 
   1277   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
   1278     int BlockIndice = BottomUpIndex2Block[i];
   1279     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
   1280     if (Block->getSuccs().size() == 0)
   1281       Block->Height = 0;
   1282     else {
   1283       unsigned Height = 0;
   1284       for (SIScheduleBlock *Succ : Block->getSuccs()) {
   1285         if (Height < Succ->Height + 1)
   1286           Height = Succ->Height + 1;
   1287       }
   1288       Block->Height = Height;
   1289     }
   1290   }
   1291 }
   1292 
   1293 // SIScheduleBlockScheduler //
   1294 
// Build the block scheduler state and immediately schedule all blocks:
// compute per-block live-out usage counts, predecessor/successor
// counters, the initial live register set, then repeatedly pick the
// best ready block until none remain.
SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      // Find the predecessor producing Reg with the highest topological
      // index; only that one gets the usage attributed.
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      std::map<unsigned, unsigned>::iterator RegPos =
        LiveOutRegsNumUsages[PredID].find(Reg);
      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
        ++LiveOutRegsNumUsages[PredID][Reg];
      } else {
        LiveOutRegsNumUsages[PredID][Reg] = 1;
      }
    }
  }

  // Per-block bookkeeping used while picking blocks below.
  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  // Registers live on entry of the scheduling region start live.
  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      // No scheduled block produces Reg: it comes from outside the
      // region, count this block as one of its consumers.
      if (!Found) {
        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
          LiveRegsConsumers[Reg] = 1;
        else
          ++LiveRegsConsumers[Reg];
      }
    }
  }

  // Blocks with no predecessor are ready immediately.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  // Schedule every block now; the result is kept in BlocksScheduled.
  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
  );
}
   1408 
// Compare TryCand against Cand with latency-oriented heuristics and
// return true when TryCand should become the new best candidate
// (TryCand.Reason records why). Criteria, in decreasing priority:
// distance from the last scheduled high latency parent, being a high
// latency block, height (for high latency blocks), then the number of
// high latency successors.
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  // No current best: TryCand wins by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}
   1433 
// Compare TryCand against Cand with register-usage heuristics and
// return true when TryCand should become the new best candidate.
// Criteria, in decreasing priority: does not increase VGPR usage, has
// at least one successor, larger height, then the smallest VGPR usage
// delta.
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  // No current best: TryCand wins by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}
   1455 
   1456 SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
   1457   SIBlockSchedCandidate Cand;
   1458   std::vector<SIScheduleBlock*>::iterator Best;
   1459   SIScheduleBlock *Block;
   1460   if (ReadyBlocks.empty())
   1461     return nullptr;
   1462 
   1463   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
   1464                         VregCurrentUsage, SregCurrentUsage);
   1465   if (VregCurrentUsage > maxVregUsage)
   1466     maxVregUsage = VregCurrentUsage;
   1467   if (VregCurrentUsage > maxSregUsage)
   1468     maxSregUsage = VregCurrentUsage;
   1469   DEBUG(
   1470     dbgs() << "Picking New Blocks\n";
   1471     dbgs() << "Available: ";
   1472     for (SIScheduleBlock* Block : ReadyBlocks)
   1473       dbgs() << Block->getID() << ' ';
   1474     dbgs() << "\nCurrent Live:\n";
   1475     for (unsigned Reg : LiveRegs)
   1476       dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
   1477     dbgs() << '\n';
   1478     dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
   1479     dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
   1480   );
   1481 
   1482   Cand.Block = nullptr;
   1483   for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
   1484        E = ReadyBlocks.end(); I != E; ++I) {
   1485     SIBlockSchedCandidate TryCand;
   1486     TryCand.Block = *I;
   1487     TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
   1488     TryCand.VGPRUsageDiff =
   1489       checkRegUsageImpact(TryCand.Block->getInRegs(),
   1490                           TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
   1491     TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
   1492     TryCand.NumHighLatencySuccessors =
   1493       TryCand.Block->getNumHighLatencySuccessors();
   1494     TryCand.LastPosHighLatParentScheduled =
   1495       (unsigned int) std::max<int> (0,
   1496          LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
   1497            LastPosWaitedHighLatency);
   1498     TryCand.Height = TryCand.Block->Height;
   1499     // Try not to increase VGPR usage too much, else we may spill.
   1500     if (VregCurrentUsage > 120 ||
   1501         Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
   1502       if (!tryCandidateRegUsage(Cand, TryCand) &&
   1503           Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
   1504         tryCandidateLatency(Cand, TryCand);
   1505     } else {
   1506       if (!tryCandidateLatency(Cand, TryCand))
   1507         tryCandidateRegUsage(Cand, TryCand);
   1508     }
   1509     if (TryCand.Reason != NoCand) {
   1510       Cand.setBest(TryCand);
   1511       Best = I;
   1512       DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
   1513                    << getReasonStr(Cand.Reason) << '\n');
   1514     }
   1515   }
   1516 
   1517   DEBUG(
   1518     dbgs() << "Picking: " << Cand.Block->getID() << '\n';
   1519     dbgs() << "Is a block with high latency instruction: "
   1520       << (Cand.IsHighLatency ? "yes\n" : "no\n");
   1521     dbgs() << "Position of last high latency dependency: "
   1522            << Cand.LastPosHighLatParentScheduled << '\n';
   1523     dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
   1524     dbgs() << '\n';
   1525   );
   1526 
   1527   Block = Cand.Block;
   1528   ReadyBlocks.erase(Best);
   1529   return Block;
   1530 }
   1531 
   1532 // Tracking of currently alive registers to determine VGPR Usage.
   1533 
   1534 void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
   1535   for (unsigned Reg : Regs) {
   1536     // For now only track virtual registers.
   1537     if (!TargetRegisterInfo::isVirtualRegister(Reg))
   1538       continue;
   1539     // If not already in the live set, then add it.
   1540     (void) LiveRegs.insert(Reg);
   1541   }
   1542 }
   1543 
   1544 void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
   1545                                        std::set<unsigned> &Regs) {
   1546   for (unsigned Reg : Regs) {
   1547     // For now only track virtual registers.
   1548     std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
   1549     assert (Pos != LiveRegs.end() && // Reg must be live.
   1550                LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
   1551                LiveRegsConsumers[Reg] >= 1);
   1552     --LiveRegsConsumers[Reg];
   1553     if (LiveRegsConsumers[Reg] == 0)
   1554       LiveRegs.erase(Pos);
   1555   }
   1556 }
   1557 
   1558 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
   1559   for (SIScheduleBlock* Block : Parent->getSuccs()) {
   1560     --BlockNumPredsLeft[Block->getID()];
   1561     if (BlockNumPredsLeft[Block->getID()] == 0) {
   1562       ReadyBlocks.push_back(Block);
   1563     }
   1564     // TODO: Improve check. When the dependency between the high latency
   1565     // instructions and the instructions of the other blocks are WAR or WAW
   1566     // there will be no wait triggered. We would like these cases to not
   1567     // update LastPosHighLatencyParentScheduled.
   1568     if (Parent->isHighLatencyBlock())
   1569       LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
   1570   }
   1571 }
   1572 
   1573 void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
   1574   decreaseLiveRegs(Block, Block->getInRegs());
   1575   addLiveRegs(Block->getOutRegs());
   1576   releaseBlockSuccs(Block);
   1577   for (std::map<unsigned, unsigned>::iterator RegI =
   1578        LiveOutRegsNumUsages[Block->getID()].begin(),
   1579        E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
   1580     std::pair<unsigned, unsigned> RegP = *RegI;
   1581     if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
   1582       LiveRegsConsumers[RegP.first] = RegP.second;
   1583     else {
   1584       assert(LiveRegsConsumers[RegP.first] == 0);
   1585       LiveRegsConsumers[RegP.first] += RegP.second;
   1586     }
   1587   }
   1588   if (LastPosHighLatencyParentScheduled[Block->getID()] >
   1589         (unsigned)LastPosWaitedHighLatency)
   1590     LastPosWaitedHighLatency =
   1591       LastPosHighLatencyParentScheduled[Block->getID()];
   1592   ++NumBlockScheduled;
   1593 }
   1594 
   1595 std::vector<int>
   1596 SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
   1597                                      std::set<unsigned> &OutRegs) {
   1598   std::vector<int> DiffSetPressure;
   1599   DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
   1600 
   1601   for (unsigned Reg : InRegs) {
   1602     // For now only track virtual registers.
   1603     if (!TargetRegisterInfo::isVirtualRegister(Reg))
   1604       continue;
   1605     if (LiveRegsConsumers[Reg] > 1)
   1606       continue;
   1607     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
   1608     for (; PSetI.isValid(); ++PSetI) {
   1609       DiffSetPressure[*PSetI] -= PSetI.getWeight();
   1610     }
   1611   }
   1612 
   1613   for (unsigned Reg : OutRegs) {
   1614     // For now only track virtual registers.
   1615     if (!TargetRegisterInfo::isVirtualRegister(Reg))
   1616       continue;
   1617     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
   1618     for (; PSetI.isValid(); ++PSetI) {
   1619       DiffSetPressure[*PSetI] += PSetI.getWeight();
   1620     }
   1621   }
   1622 
   1623   return DiffSetPressure;
   1624 }
   1625 
   1626 // SIScheduler //
   1627 
   1628 struct SIScheduleBlockResult
   1629 SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
   1630                              SISchedulerBlockSchedulerVariant ScheduleVariant) {
   1631   SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
   1632   SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
   1633   std::vector<SIScheduleBlock*> ScheduledBlocks;
   1634   struct SIScheduleBlockResult Res;
   1635 
   1636   ScheduledBlocks = Scheduler.getBlocks();
   1637 
   1638   for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
   1639     SIScheduleBlock *Block = ScheduledBlocks[b];
   1640     std::vector<SUnit*> SUs = Block->getScheduledUnits();
   1641 
   1642     for (SUnit* SU : SUs)
   1643       Res.SUs.push_back(SU->NodeNum);
   1644   }
   1645 
   1646   Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
   1647   Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
   1648   return Res;
   1649 }
   1650 
   1651 // SIScheduleDAGMI //
   1652 
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
  // Cache SI-specific views of the TII/TRI pointers set up by the base class.
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  // Pressure-set IDs used by fillVgprSgprCost to classify register weights.
  VGPRSetID = SITRI->getVGPR32PressureSet();
  SGPRSetID = SITRI->getSGPR32PressureSet();
}
   1661 
   1662 SIScheduleDAGMI::~SIScheduleDAGMI() {
   1663 }
   1664 
// Factory returning a fresh SIScheduleDAGMI; the caller owns the result.
ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}
   1668 
   1669 // Code adapted from scheduleDAG.cpp
   1670 // Does a topological sort over the SUs.
   1671 // Both TopDown and BottomUp
   1672 void SIScheduleDAGMI::topologicalSort() {
   1673   Topo.InitDAGTopologicalSorting();
   1674 
   1675   TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
   1676   BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
   1677 }
   1678 
// Move low latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would
// take into account SGPR usage (based on VGPR usage
// and the corresponding wavefront count), and that would
// try to merge groups of loads if it makes sense, etc.
// Walks the final schedule and hoists each low latency instruction (and the
// COPYs it depends on) to the earliest legal slot, so more instructions can
// execute between the load and its first user.
void SIScheduleDAGMI::moveLowLatencies() {
   unsigned DAGSize = SUnits.size();
   int LastLowLatencyUser = -1;   // Latest slot holding a user of a low latency instruction.
   int LastLowLatencyPos = -1;    // Latest slot holding a low latency instruction.

   for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;           // Earliest legal slot: one past the last predecessor.

    // Scan predecessors to find the earliest slot SU could occupy, and note
    // whether SU consumes the result of a low latency instruction.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      // NodeNum >= DAGSize marks boundary/artificial nodes with no slot.
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      // Place right after the last low latency user/instruction, but never
      // before a predecessor (MinPos).
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Rotate slots [BestPos, i] right by one, keeping the inverse map
        // (ScheduledSUnitsInv) consistent, then drop SU into BestPos.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Moves COPY instructions on which depends
    // the low latency instructions too.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        // Same right-rotation as above, moving the COPY to its earliest slot.
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
   1749 
   1750 void SIScheduleDAGMI::restoreSULinksLeft() {
   1751   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
   1752     SUnits[i].isScheduled = false;
   1753     SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
   1754     SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
   1755     SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
   1756     SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
   1757   }
   1758 }
   1759 
   1760 // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
   1761 template<typename _Iterator> void
   1762 SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
   1763                                   unsigned &VgprUsage, unsigned &SgprUsage) {
   1764   VgprUsage = 0;
   1765   SgprUsage = 0;
   1766   for (_Iterator RegI = First; RegI != End; ++RegI) {
   1767     unsigned Reg = *RegI;
   1768     // For now only track virtual registers
   1769     if (!TargetRegisterInfo::isVirtualRegister(Reg))
   1770       continue;
   1771     PSetIterator PSetI = MRI.getPressureSets(Reg);
   1772     for (; PSetI.isValid(); ++PSetI) {
   1773       if (*PSetI == VGPRSetID)
   1774         VgprUsage += PSetI.getWeight();
   1775       else if (*PSetI == SGPRSetID)
   1776         SgprUsage += PSetI.getWeight();
   1777     }
   1778   }
   1779 }
   1780 
// Entry point of the SI scheduler for one region: builds the DAG, gathers
// latency statistics, tries several block creator/scheduler variant
// combinations keeping the one with the best (lowest) VGPR usage among the
// acceptable ones, then emits the chosen instruction order.
void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for(SUnit& SU : SUnits)
       SU.dumpAll(this)
  );

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it)
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  // Snapshot the dependency counters so restoreSULinksLeft can reset them
  // between scheduling attempts.
  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  // Classify every SU as low/high latency and record the memory offset of
  // low latency accesses when it can be determined.
  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  // First try the variant expected to perform best.
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // if VGPR usage is extremely high, try other good performing variants
  // which could lead to lower VGPR usage
  if (Best.MaxVGPRUsage > 180) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // if VGPR usage is still extremely high, we may spill. Try other variants
  // which are less performing, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  // Adopt the winning order and build its inverse (node number -> slot).
  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  // Emit the instructions in the chosen order (top-down).
  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
      unsigned BBNum = begin()->getParent()->getNumber();
      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
      dumpSchedule();
      dbgs() << '\n';
    });
}
   1904