      1 //===- subzero/src/IceCfg.cpp - Control flow graph implementation ---------===//
      2 //
      3 //                        The Subzero Code Generator
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Implements the Cfg class.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "IceCfg.h"
     16 
     17 #include "IceAssembler.h"
     18 #include "IceBitVector.h"
     19 #include "IceCfgNode.h"
     20 #include "IceClFlags.h"
     21 #include "IceDefs.h"
     22 #include "IceELFObjectWriter.h"
     23 #include "IceGlobalInits.h"
     24 #include "IceInst.h"
     25 #include "IceInstVarIter.h"
     26 #include "IceInstrumentation.h"
     27 #include "IceLiveness.h"
     28 #include "IceLoopAnalyzer.h"
     29 #include "IceOperand.h"
     30 #include "IceTargetLowering.h"
     31 
     32 #include <memory>
     33 #include <utility>
     34 
     35 namespace Ice {
     36 
     37 Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)
     38     : Allocator(createAllocator()), Ctx(Ctx), SequenceNumber(SequenceNumber),
     39       VMask(getFlags().getVerbose()), FunctionName(),
     40       NextInstNumber(Inst::NumberInitial), Live(nullptr) {
     41   NodeStrings.reset(new StringPool);
     42   VarStrings.reset(new StringPool);
     43   CfgLocalAllocatorScope _(this);
     44   Target = TargetLowering::createLowering(getFlags().getTargetArch(), this);
     45   VMetadata.reset(new VariablesMetadata(this));
     46   TargetAssembler = Target->createAssembler();
     47 
     48   if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Randomize) {
     49     // If -randomize-pool-immediates=randomize, create a random number
     50     // generator to generate a cookie for constant blinding.
     51     RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_ConstantBlinding,
     52                               this->SequenceNumber);
     53     ConstantBlindingCookie =
     54         (uint32_t)RNG.next((uint64_t)std::numeric_limits<uint32_t>::max() + 1);
     55   }
     56 }
     57 
     58 Cfg::~Cfg() {
     59   assert(CfgAllocatorTraits::current() == nullptr);
     60   if (getFlags().getDumpStrings()) {
     61     OstreamLocker _(Ctx);
     62     Ostream &Str = Ctx->getStrDump();
     63     getNodeStrings()->dump(Str);
     64     getVarStrings()->dump(Str);
     65   }
     66 }
     67 
     68 // Called in the initializer list of Cfg's constructor to create the Allocator
     69 // and set it as TLS before any other member fields are constructed, since they
     70 // may depend on it.
     71 ArenaAllocator *Cfg::createAllocator() {
     72   ArenaAllocator *Allocator = new ArenaAllocator();
     73   CfgAllocatorTraits::set_current(Allocator);
     74   return Allocator;
     75 }
     76 
     77 /// Create a string like "foo(i=123:b=9)" indicating the function name, number
     78 /// of high-level instructions, and number of basic blocks.  This string is only
     79 /// used for dumping and other diagnostics, and the idea is that given a set of
     80 /// functions to debug a problem on, it's easy to find the smallest or simplest
     81 /// function to attack.  Note that the counts may change somewhat depending on
     82 /// what point it is called during the translation passes.
     83 std::string Cfg::getFunctionNameAndSize() const {
     84   if (!BuildDefs::dump())
     85     return getFunctionName().toString();
     86   SizeT NodeCount = 0;
     87   SizeT InstCount = 0;
     88   for (CfgNode *Node : getNodes()) {
     89     ++NodeCount;
     90     // Note: deleted instructions are *not* ignored.
     91     InstCount += Node->getPhis().size();
     92     for (Inst &I : Node->getInsts()) {
     93       if (!llvm::isa<InstTarget>(&I))
     94         ++InstCount;
     95     }
     96   }
     97   return getFunctionName() + "(i=" + std::to_string(InstCount) + ":b=" +
     98          std::to_string(NodeCount) + ")";
     99 }
    100 
    101 void Cfg::setError(const std::string &Message) {
    102   HasError = true;
    103   ErrorMessage = Message;
    104 }
    105 
    106 CfgNode *Cfg::makeNode() {
    107   SizeT LabelIndex = Nodes.size();
    108   auto *Node = CfgNode::create(this, LabelIndex);
    109   Nodes.push_back(Node);
    110   return Node;
    111 }
    112 
    113 void Cfg::swapNodes(NodeList &NewNodes) {
    114   assert(Nodes.size() == NewNodes.size());
    115   Nodes.swap(NewNodes);
    116   for (SizeT I = 0, NumNodes = getNumNodes(); I < NumNodes; ++I)
    117     Nodes[I]->resetIndex(I);
    118 }
    119 
    120 template <> Variable *Cfg::makeVariable<Variable>(Type Ty) {
    121   SizeT Index = Variables.size();
    122   Variable *Var;
    123   if (Target->shouldSplitToVariableVecOn32(Ty)) {
    124     Var = VariableVecOn32::create(this, Ty, Index);
    125   } else if (Target->shouldSplitToVariable64On32(Ty)) {
    126     Var = Variable64On32::create(this, Ty, Index);
    127   } else {
    128     Var = Variable::create(this, Ty, Index);
    129   }
    130   Variables.push_back(Var);
    131   return Var;
    132 }
    133 
    134 void Cfg::addArg(Variable *Arg) {
    135   Arg->setIsArg();
    136   Args.push_back(Arg);
    137 }
    138 
    139 void Cfg::addImplicitArg(Variable *Arg) {
    140   Arg->setIsImplicitArg();
    141   ImplicitArgs.push_back(Arg);
    142 }
    143 
    144 // Returns whether the stack frame layout has been computed yet. This is used
    145 // for dumping the stack frame location of Variables.
    146 bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }
    147 
    148 namespace {
    149 constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$";
    150 constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$";
    151 } // end of anonymous namespace
    152 
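        // Illustrative sketch of the data emitted by the two helpers below, assuming
        // a node whose assembler name is "foo$__0" (a hypothetical name; the real one
        // comes from CfgNode::getAsmName()):
        //   .L$profiler$block_name$foo$__0: a constant, 8-byte-aligned global holding
        //       the NUL-terminated string "foo$__0"
        //   .L$profiler$block_info$foo$__0: an 8-byte-aligned global holding an
        //       8-byte zeroed execution counter followed by a relocation that points
        //       at the name global above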
    153 void Cfg::createNodeNameDeclaration(const std::string &NodeAsmName) {
    154   auto *Var = VariableDeclaration::create(GlobalInits.get());
    155   Var->setName(Ctx, BlockNameGlobalPrefix + NodeAsmName);
    156   Var->setIsConstant(true);
    157   Var->addInitializer(VariableDeclaration::DataInitializer::create(
    158       GlobalInits.get(), NodeAsmName.data(), NodeAsmName.size() + 1));
    159   const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
    160   Var->setAlignment(Int64ByteSize); // Wasteful, 32-bit could use 4 bytes.
    161   GlobalInits->push_back(Var);
    162 }
    163 
    164 void Cfg::createBlockProfilingInfoDeclaration(
    165     const std::string &NodeAsmName, VariableDeclaration *NodeNameDeclaration) {
    166   auto *Var = VariableDeclaration::create(GlobalInits.get());
    167   Var->setName(Ctx, BlockStatsGlobalPrefix + NodeAsmName);
    168   const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
    169   Var->addInitializer(VariableDeclaration::ZeroInitializer::create(
    170       GlobalInits.get(), Int64ByteSize));
    171 
    172   const RelocOffsetT NodeNameDeclarationOffset = 0;
    173   Var->addInitializer(VariableDeclaration::RelocInitializer::create(
    174       GlobalInits.get(), NodeNameDeclaration,
    175       {RelocOffset::create(Ctx, NodeNameDeclarationOffset)}));
    176   Var->setAlignment(Int64ByteSize);
    177   GlobalInits->push_back(Var);
    178 }
    179 
    180 void Cfg::profileBlocks() {
    181   if (GlobalInits == nullptr)
    182     GlobalInits.reset(new VariableDeclarationList());
    183 
    184   for (CfgNode *Node : Nodes) {
    185     const std::string NodeAsmName = Node->getAsmName();
    186     createNodeNameDeclaration(NodeAsmName);
    187     createBlockProfilingInfoDeclaration(NodeAsmName, GlobalInits->back());
    188     Node->profileExecutionCount(GlobalInits->back());
    189   }
    190 }
    191 
    192 bool Cfg::isProfileGlobal(const VariableDeclaration &Var) {
    193   if (!Var.getName().hasStdString())
    194     return false;
    195   return Var.getName().toString().find(BlockStatsGlobalPrefix) == 0;
    196 }
    197 
    198 void Cfg::addCallToProfileSummary() {
    199   // The call(s) to __Sz_profile_summary are added by the profiler in functions
    200   // that cause the program to exit. This function is defined in
    201   // runtime/szrt_profiler.c.
    202   Constant *ProfileSummarySym =
    203       Ctx->getConstantExternSym(Ctx->getGlobalString("__Sz_profile_summary"));
    204   constexpr SizeT NumArgs = 0;
    205   constexpr Variable *Void = nullptr;
    206   constexpr bool HasTailCall = false;
    207   auto *Call =
    208       InstCall::create(this, NumArgs, Void, ProfileSummarySym, HasTailCall);
    209   getEntryNode()->getInsts().push_front(Call);
    210 }
    211 
    212 void Cfg::translate() {
    213   if (hasError())
    214     return;
    215   // Cache the possibly-overridden optimization level once translation begins.
    216   // It would be nicer to do this in the constructor, but we need to wait until
    217   // after setFunctionName() has a chance to be called.
    218   OptimizationLevel =
    219       getFlags().matchForceO2(getFunctionName(), getSequenceNumber())
    220           ? Opt_2
    221           : getFlags().getOptLevel();
    222   if (BuildDefs::timers()) {
    223     if (getFlags().matchTimingFocus(getFunctionName(), getSequenceNumber())) {
    224       setFocusedTiming();
    225       getContext()->resetTimer(GlobalContext::TSK_Default);
    226     }
    227   }
    228   if (BuildDefs::dump()) {
    229     if (isVerbose(IceV_Status) &&
    230         getFlags().matchTestStatus(getFunctionName(), getSequenceNumber())) {
    231       getContext()->getStrDump() << ">>>Translating "
    232                                  << getFunctionNameAndSize()
    233                                  << " seq=" << getSequenceNumber() << "\n";
    234     }
    235   }
    236   TimerMarker T_func(getContext(), getFunctionName().toStringOrEmpty());
    237   TimerMarker T(TimerStack::TT_translate, this);
    238 
    239   dump("Initial CFG");
    240 
    241   if (getFlags().getEnableBlockProfile()) {
    242     profileBlocks();
    243     // TODO(jpp): this is fragile, at best. Figure out a better way of
    244     // detecting exit functions.
    245     if (getFunctionName().toStringOrEmpty() == "exit") {
    246       addCallToProfileSummary();
    247     }
    248     dump("Profiled CFG");
    249   }
    250 
    251   // Create the Hi and Lo variables where a split was needed
    252   for (Variable *Var : Variables) {
    253     if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) {
    254       Var64On32->initHiLo(this);
    255     } else if (auto *VarVecOn32 = llvm::dyn_cast<VariableVecOn32>(Var)) {
    256       VarVecOn32->initVecElement(this);
    257     }
    258   }
    259 
    260   // Instrument the Cfg, e.g. with AddressSanitizer
    261   if (!BuildDefs::minimal() && getFlags().getSanitizeAddresses()) {
    262     getContext()->instrumentFunc(this);
    263     dump("Instrumented CFG");
    264   }
    265 
    266   // The set of translation passes and their order are determined by the
    267   // target.
    268   getTarget()->translate();
    269 
    270   dump("Final output");
    271   if (getFocusedTiming()) {
    272     getContext()->dumpLocalTimers(getFunctionName().toString());
    273   }
    274 }
    275 
    276 void Cfg::fixPhiNodes() {
    277   for (auto *Node : Nodes) {
    278     // Fix up all the phi edge labels, since the WASM frontend cannot set them
    279     // correctly when the phis are first created.
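            // For example (hypothetical labels): in a node whose in-edges are
            // {__2, __5}, a phi written as
            //   %x = phi i32 [%a, ?], [%b, ?]
            // has its labels rewritten in in-edge order, giving
            //   %x = phi i32 [%a, __2], [%b, __5]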
    280     assert(Node);
    281     const auto &InEdges = Node->getInEdges();
    282     for (auto &Instr : Node->getPhis()) {
    283       auto *Phi = llvm::cast<InstPhi>(&Instr);
    284       assert(Phi);
    285       for (SizeT i = 0; i < InEdges.size(); ++i) {
    286         Phi->setLabel(i, InEdges[i]);
    287       }
    288     }
    289   }
    290 }
    291 
    292 void Cfg::computeInOutEdges() {
    293   // Compute the out-edges.
    294   for (CfgNode *Node : Nodes) {
    295     Node->computeSuccessors();
    296   }
    297 
    298   // Prune any unreachable nodes before computing in-edges.
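          // Illustrative sketch (hypothetical indices): with out-edges 0->1, 0->2 and
          // 2->1, plus an orphaned node 3, the worklist below visits {0, 1, 2}; node 3
          // never becomes Reachable and is compacted away by the renumbering loop that
          // follows.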
    299   SizeT NumNodes = getNumNodes();
    300   BitVector Reachable(NumNodes);
    301   BitVector Pending(NumNodes);
    302   Pending.set(getEntryNode()->getIndex());
    303   while (true) {
    304     int Index = Pending.find_first();
    305     if (Index == -1)
    306       break;
    307     Pending.reset(Index);
    308     Reachable.set(Index);
    309     CfgNode *Node = Nodes[Index];
    310     assert(Node->getIndex() == (SizeT)Index);
    311     for (CfgNode *Succ : Node->getOutEdges()) {
    312       SizeT SuccIndex = Succ->getIndex();
    313       if (!Reachable.test(SuccIndex))
    314         Pending.set(SuccIndex);
    315     }
    316   }
    317   SizeT Dest = 0;
    318   for (SizeT Source = 0; Source < NumNodes; ++Source) {
    319     if (Reachable.test(Source)) {
    320       Nodes[Dest] = Nodes[Source];
    321       Nodes[Dest]->resetIndex(Dest);
    322       // Compute the in-edges.
    323       Nodes[Dest]->computePredecessors();
    324       ++Dest;
    325     }
    326   }
    327   Nodes.resize(Dest);
    328 
    329   TimerMarker T(TimerStack::TT_phiValidation, this);
    330   for (CfgNode *Node : Nodes)
    331     Node->enforcePhiConsistency();
    332 }
    333 
    334 void Cfg::renumberInstructions() {
    335   TimerMarker T(TimerStack::TT_renumberInstructions, this);
    336   NextInstNumber = Inst::NumberInitial;
    337   for (CfgNode *Node : Nodes)
    338     Node->renumberInstructions();
    339   // Make sure the entry node is the first node and therefore got the lowest
    340   // instruction numbers, to facilitate live range computation of function
    341   // arguments.  We want to model function arguments as being live on entry to
    342   // the function, otherwise an argument whose only use is in the first
    343   // instruction will be assigned a trivial live range and the register
    344   // allocator will not recognize its live range as overlapping another
    345   // variable's live range.
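          // For example (hypothetical variable): an argument %a whose only use is the
          // entry node's first instruction would otherwise end up with a trivial live
          // range, and the register allocator could fail to see it overlapping the
          // ranges of variables defined nearby.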
    346   assert(Nodes.empty() || (*Nodes.begin() == getEntryNode()));
    347 }
    348 
    349 // placePhiLoads() must be called before placePhiStores().
    350 void Cfg::placePhiLoads() {
    351   TimerMarker T(TimerStack::TT_placePhiLoads, this);
    352   for (CfgNode *Node : Nodes)
    353     Node->placePhiLoads();
    354 }
    355 
    356 // placePhiStores() must be called after placePhiLoads().
    357 void Cfg::placePhiStores() {
    358   TimerMarker T(TimerStack::TT_placePhiStores, this);
    359   for (CfgNode *Node : Nodes)
    360     Node->placePhiStores();
    361 }
    362 
    363 void Cfg::deletePhis() {
    364   TimerMarker T(TimerStack::TT_deletePhis, this);
    365   for (CfgNode *Node : Nodes)
    366     Node->deletePhis();
    367 }
    368 
    369 void Cfg::advancedPhiLowering() {
    370   TimerMarker T(TimerStack::TT_advancedPhiLowering, this);
    371   // Clear all previously computed live ranges (but not live-in/live-out bit
    372   // vectors or last-use markers), because the follow-on register allocation is
    373   // only concerned with live ranges across the newly created blocks.
    374   for (Variable *Var : Variables) {
    375     Var->getLiveRange().reset();
    376   }
    377   // This splits edges and appends new nodes to the end of the node list. This
    378   // can invalidate iterators, so don't use an iterator.
    379   SizeT NumNodes = getNumNodes();
    380   SizeT NumVars = getNumVariables();
    381   for (SizeT I = 0; I < NumNodes; ++I)
    382     Nodes[I]->advancedPhiLowering();
    383 
    384   TimerMarker TT(TimerStack::TT_lowerPhiAssignments, this);
    385   if (true) {
    386     // The following code does an in-place update of liveness and live ranges
    387     // as a result of adding the new phi edge split nodes.
    388     getLiveness()->initPhiEdgeSplits(Nodes.begin() + NumNodes,
    389                                      Variables.begin() + NumVars);
    390     TimerMarker TTT(TimerStack::TT_liveness, this);
    391     // Iterate over the newly added nodes to add their liveness info.
    392     for (auto I = Nodes.begin() + NumNodes, E = Nodes.end(); I != E; ++I) {
    393       InstNumberT FirstInstNum = getNextInstNumber();
    394       (*I)->renumberInstructions();
    395       InstNumberT LastInstNum = getNextInstNumber() - 1;
    396       (*I)->liveness(getLiveness());
    397       (*I)->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
    398     }
    399   } else {
    400     // The following code does a brute-force recalculation of live ranges as a
    401     // result of adding the new phi edge split nodes. The liveness calculation
    402     // is particularly expensive because the new nodes are not yet in a proper
    403     // topological order and so convergence is slower.
    404     //
    405     // This code is kept here for reference and can be temporarily enabled in
    406     // case the incremental code is under suspicion.
    407     renumberInstructions();
    408     liveness(Liveness_Intervals);
    409     getVMetadata()->init(VMK_All);
    410   }
    411   Target->regAlloc(RAK_Phi);
    412 }
    413 
    414 // Find a reasonable placement for nodes that have not yet been placed, while
    415 // maintaining the same relative ordering among already placed nodes.
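        // Illustrative sketch (hypothetical node names): an edge-split node __7 with
        // the single out-edge __3 is inserted immediately before __3 when __3 has
        // already been tentatively placed and __7 is its only in-edge; otherwise __7
        // is placed right after its first predecessor.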
    416 void Cfg::reorderNodes() {
    417   // TODO(ascull): it would be nice if the switch tests were always followed by
    418   // the default case to allow for fall through.
    419   using PlacedList = CfgList<CfgNode *>;
    420   PlacedList Placed;      // Nodes with relative placement locked down
    421   PlacedList Unreachable; // Unreachable nodes
    422   PlacedList::iterator NoPlace = Placed.end();
    423   // Keep track of where each node has been tentatively placed so that we can
    424   // manage insertions into the middle.
    425   CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
    426   for (CfgNode *Node : Nodes) {
    427     // The "do ... while(0);" construct is to factor out the --PlaceIndex and
    428     // assert() statements before moving to the next node.
    429     do {
    430       if (Node != getEntryNode() && Node->getInEdges().empty()) {
    431         // The node has essentially been deleted since it is not a successor of
    432         // any other node.
    433         Unreachable.push_back(Node);
    434         PlaceIndex[Node->getIndex()] = Unreachable.end();
    435         Node->setNeedsPlacement(false);
    436         continue;
    437       }
    438       if (!Node->needsPlacement()) {
    439         // Add to the end of the Placed list.
    440         Placed.push_back(Node);
    441         PlaceIndex[Node->getIndex()] = Placed.end();
    442         continue;
    443       }
    444       Node->setNeedsPlacement(false);
    445       // Assume for now that the unplaced node is from edge-splitting and
    446       // therefore has 1 in-edge and 1 out-edge (actually, possibly more than 1
    447       // in-edge if the predecessor node was contracted). If this changes in
    448       // the future, rethink the strategy.
    449       assert(Node->getInEdges().size() >= 1);
    450       assert(Node->hasSingleOutEdge());
    451 
    452       // If it's a (non-critical) edge where the successor has a single
    453       // in-edge, then place it before the successor.
    454       CfgNode *Succ = Node->getOutEdges().front();
    455       if (Succ->getInEdges().size() == 1 &&
    456           PlaceIndex[Succ->getIndex()] != NoPlace) {
    457         Placed.insert(PlaceIndex[Succ->getIndex()], Node);
    458         PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()];
    459         continue;
    460       }
    461 
    462       // Otherwise, place it after the (first) predecessor.
    463       CfgNode *Pred = Node->getInEdges().front();
    464       auto PredPosition = PlaceIndex[Pred->getIndex()];
    465       // It shouldn't be the case that PredPosition==NoPlace, but if that
    466       // somehow turns out to be true, we just insert Node before
    467       // PredPosition=NoPlace=Placed.end().
    468       if (PredPosition != NoPlace)
    469         ++PredPosition;
    470       Placed.insert(PredPosition, Node);
    471       PlaceIndex[Node->getIndex()] = PredPosition;
    472     } while (0);
    473 
    474     --PlaceIndex[Node->getIndex()];
    475     assert(*PlaceIndex[Node->getIndex()] == Node);
    476   }
    477 
    478   // Reorder Nodes according to the built-up lists.
    479   NodeList Reordered;
    480   Reordered.reserve(Placed.size() + Unreachable.size());
    481   for (CfgNode *Node : Placed)
    482     Reordered.push_back(Node);
    483   for (CfgNode *Node : Unreachable)
    484     Reordered.push_back(Node);
    485   assert(getNumNodes() == Reordered.size());
    486   swapNodes(Reordered);
    487 }
    488 
    489 namespace {
    490 void getRandomPostOrder(CfgNode *Node, BitVector &ToVisit,
    491                         Ice::NodeList &PostOrder,
    492                         Ice::RandomNumberGenerator *RNG) {
    493   assert(ToVisit[Node->getIndex()]);
    494   ToVisit[Node->getIndex()] = false;
    495   NodeList Outs = Node->getOutEdges();
    496   Ice::RandomShuffle(Outs.begin(), Outs.end(),
    497                      [RNG](int N) { return RNG->next(N); });
    498   for (CfgNode *Next : Outs) {
    499     if (ToVisit[Next->getIndex()])
    500       getRandomPostOrder(Next, ToVisit, PostOrder, RNG);
    501   }
    502   PostOrder.push_back(Node);
    503 }
    504 } // end of anonymous namespace
    505 
    506 void Cfg::shuffleNodes() {
    507   if (!getFlags().getReorderBasicBlocks())
    508     return;
    509 
    510   NodeList ReversedReachable;
    511   NodeList Unreachable;
    512   BitVector ToVisit(Nodes.size(), true);
    513   // Create a random number generator for basic block reordering.
    514   RandomNumberGenerator RNG(getFlags().getRandomSeed(),
    515                             RPE_BasicBlockReordering, SequenceNumber);
    516   // Traverse from entry node.
    517   getRandomPostOrder(getEntryNode(), ToVisit, ReversedReachable, &RNG);
    518   // Collect the unreachable nodes.
    519   for (CfgNode *Node : Nodes)
    520     if (ToVisit[Node->getIndex()])
    521       Unreachable.push_back(Node);
    522   // Copy the layout list to the Nodes.
    523   NodeList Shuffled;
    524   Shuffled.reserve(ReversedReachable.size() + Unreachable.size());
    525   for (CfgNode *Node : reverse_range(ReversedReachable))
    526     Shuffled.push_back(Node);
    527   for (CfgNode *Node : Unreachable)
    528     Shuffled.push_back(Node);
    529   assert(Nodes.size() == Shuffled.size());
    530   swapNodes(Shuffled);
    531 
    532   dump("After basic block shuffling");
    533 }
    534 
    535 void Cfg::localCSE(bool AssumeSSA) {
    536   // Performs basic-block local common-subexpression elimination
    537   // If we have
    538   // t1 = op b c
    539   // t2 = op b c
    540   // This pass will replace future references to t2 in a basic block by t1
    541   // Points to note:
    542   // 1. Assumes SSA by default. To change this, use -lcse=no-ssa
    543   //      This is needed if this pass is moved to a point later in the pipeline.
    544   //      If variables have a single definition (in the node), CSE can work on
    545   //      a simple equality compare of instructions (sans Dest). When variables
    546   //      can be updated (hence, non-SSA), the result of a previous instruction
    547   //      that used that variable as an operand cannot be reused (example below).
    548   // 2. Leaves removal of instructions to DCE.
    549   // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
    550   //    to take care of cases not arising from GEP simplification.
    551   // 4. By default, a single pass is made over each basic block. Control this
    552   //    with -lcse-max-iters=N
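          //   A sketch of the non-SSA invalidation from note 1 (hypothetical names):
          //     t1 = add b c
          //     b  = add d e     ; b is redefined
          //     t2 = add b c     ; must not be replaced by t1
          //   With -lcse=no-ssa, the redefinition of b erases the remembered "add b c"
          //   through the Dependency map below, so t2 is treated as a new expression.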
    553 
    554   TimerMarker T(TimerStack::TT_localCse, this);
    555   struct VariableHash {
    556     size_t operator()(const Variable *Var) const { return Var->hashValue(); }
    557   };
    558 
    559   struct InstHash {
    560     size_t operator()(const Inst *Instr) const {
    561       auto Kind = Instr->getKind();
    562       auto Result =
    563           std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
    564               Kind);
    565       for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
    566         Result ^= Instr->getSrc(i)->hashValue();
    567       }
    568       return Result;
    569     }
    570   };
    571   struct InstEq {
    572     bool srcEq(const Operand *A, const Operand *B) const {
    573       if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
    574         return (A == B);
    575       return false;
    576     }
    577     bool operator()(const Inst *InstrA, const Inst *InstrB) const {
    578       if ((InstrA->getKind() != InstrB->getKind()) ||
    579           (InstrA->getSrcSize() != InstrB->getSrcSize()))
    580         return false;
    581 
    582       if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
    583         auto *B = llvm::cast<InstArithmetic>(InstrB);
    584         // A and B are guaranteed to be of the same kind at this point,
    585         // so dyn_cast is not needed for B.
    586         if (A->getOp() != B->getOp())
    587           return false;
    588       }
    589       // Does not enter loop if different kind or number of operands
    590       for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
    591         if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
    592           return false;
    593       }
    594       return true;
    595     }
    596   };
    597 
    598   for (CfgNode *Node : getNodes()) {
    599     CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
    600     // Stores currently available instructions.
    601 
    602     CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
    603     // Combining the above two into a single data structure, i.e. a map from
    604     // Instruction -> Set of Variables, might use less memory but would be slower.
    605 
    606     CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
    607     // Maps a variable to the Instructions that depend on it.
    608     // a = op1 b c
    609     // x = op2 c d
    610     // Will result in the map : b -> {a}, c -> {a, x}, d -> {x}
    611     // Not necessary for SSA as dependencies will never be invalidated, and the
    612     // container will use minimal memory when left unused.
    613 
    614     auto IterCount = getFlags().getLocalCseMaxIterations();
    615 
    616     for (uint32_t i = 0; i < IterCount; ++i) {
    617       // TODO(manasijm): Stats on IterCount -> performance
    618       for (Inst &Instr : Node->getInsts()) {
    619         if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
    620           continue;
    621         if (!AssumeSSA) {
    622           // Invalidate replacements
    623           auto Iter = Replacements.find(Instr.getDest());
    624           if (Iter != Replacements.end()) {
    625             Replacements.erase(Iter);
    626           }
    627 
    628           // Invalidate 'seen' instructions whose operands were just updated.
    629           auto DepIter = Dependency.find(Instr.getDest());
    630           if (DepIter != Dependency.end()) {
    631             for (auto *DepInst : DepIter->second) {
    632               Seen.erase(DepInst);
    633             }
    634           }
    635         }
    636 
    637         // Replace - doing this before checking for repetitions might enable
    638         // more optimizations
    639         for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
    640           auto *Opnd = Instr.getSrc(i);
    641           if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
    642             if (Replacements.find(Var) != Replacements.end()) {
    643               Instr.replaceSource(i, Replacements[Var]);
    644             }
    645           }
    646         }
    647 
    648         // Check for repetitions
    649         auto SeenIter = Seen.find(&Instr);
    650         if (SeenIter != Seen.end()) { // seen before
    651           const Inst *Found = *SeenIter;
    652           Replacements[Instr.getDest()] = Found->getDest();
    653         } else { // new
    654           Seen.insert(&Instr);
    655 
    656           if (!AssumeSSA) {
    657             // Update dependencies
    658             for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
    659               auto *Opnd = Instr.getSrc(i);
    660               if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
    661                 Dependency[Var].push_back(&Instr);
    662               }
    663             }
    664           }
    665         }
    666       }
    667     }
    668   }
    669 }
    670 
    671 void Cfg::loopInvariantCodeMotion() {
    672   TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
    673   // Does not introduce new nodes as of now.
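          // For illustration (hypothetical IR): if a loop body repeatedly computes
          //   t = add a b
          // where a and b are function arguments (or otherwise already invariant),
          // findLoopInvariantInstructions() returns that add, and the code below
          // re-appends it to the preheader just ahead of the preheader's original
          // terminator.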
    674   for (auto &Loop : LoopInfo) {
    675     CfgNode *Header = Loop.Header;
    676     assert(Header);
    677     if (Header->getLoopNestDepth() < 1)
    678       continue;
    679     CfgNode *PreHeader = Loop.PreHeader;
    680     if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) {
    681       continue; // try next loop
    682     }
    683 
    684     auto &Insts = PreHeader->getInsts();
    685     auto &LastInst = Insts.back();
    686     Insts.pop_back();
    687 
    688     for (auto *Inst : findLoopInvariantInstructions(Loop.Body)) {
    689       PreHeader->appendInst(Inst);
    690     }
    691     PreHeader->appendInst(&LastInst);
    692   }
    693 }
    694 
    695 CfgVector<Inst *>
    696 Cfg::findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> &Body) {
    697   CfgUnorderedSet<Inst *> InvariantInsts;
    698   CfgUnorderedSet<Variable *> InvariantVars;
    699   for (auto *Var : getArgs()) {
    700     InvariantVars.insert(Var);
    701   }
    702   bool Changed = false;
    703   do {
    704     Changed = false;
    705     for (auto NodeIndex : Body) {
    706       auto *Node = Nodes[NodeIndex];
    707       CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(),
    708                                                     Node->getInsts().end());
    709 
    710       for (auto &InstRef : Insts) {
    711         auto &Inst = InstRef.get();
    712         if (Inst.isDeleted() ||
    713             InvariantInsts.find(&Inst) != InvariantInsts.end())
    714           continue;
    715         switch (Inst.getKind()) {
    716         case Inst::InstKind::Alloca:
    717         case Inst::InstKind::Br:
    718         case Inst::InstKind::Ret:
    719         case Inst::InstKind::Phi:
    720         case Inst::InstKind::Call:
    721         case Inst::InstKind::IntrinsicCall:
    722         case Inst::InstKind::Load:
    723         case Inst::InstKind::Store:
    724         case Inst::InstKind::Switch:
    725           continue;
    726         default:
    727           break;
    728         }
    729 
    730         bool IsInvariant = true;
    731         for (SizeT i = 0; i < Inst.getSrcSize(); ++i) {
    732           if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) {
    733             if (InvariantVars.find(Var) == InvariantVars.end()) {
    734               IsInvariant = false;
    735             }
    736           }
    737         }
    738         if (IsInvariant) {
    739           Changed = true;
    740           InvariantInsts.insert(&Inst);
    741           Node->getInsts().remove(Inst);
    742           if (Inst.getDest() != nullptr) {
    743             InvariantVars.insert(Inst.getDest());
    744           }
    745         }
    746       }
    747     }
    748   } while (Changed);
    749 
    750   CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end());
    751   std::sort(InstVector.begin(), InstVector.end(),
    752             [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); });
    753   return InstVector;
    754 }
    755 
    756 void Cfg::shortCircuitJumps() {
    757   // Split Nodes whenever an early jump is possible.
    758   // __N :
    759   //   a = <something>
    760   //   Instruction 1 without side effect
    761   //   ... b = <something> ...
    762   //   Instruction N without side effect
    763   //   t1 = or a b
    764   //   br t1 __X __Y
    765   //
    766   // is transformed into:
    767   // __N :
    768   //   a = <something>
    769   //   br a __X __N_ext
    770   //
    771   // __N_ext :
    772   //   Instruction 1 without side effect
    773   //   ... b = <something> ...
    774   //   Instruction N without side effect
    775   //   br b __X __Y
    776   // (Similar logic for AND, jump to false instead of true target.)
    777 
    778   TimerMarker T(TimerStack::TT_shortCircuit, this);
    779   getVMetadata()->init(VMK_Uses);
    780   auto NodeStack = this->getNodes();
    781   CfgUnorderedMap<SizeT, CfgVector<CfgNode *>> Splits;
    782   while (!NodeStack.empty()) {
    783     auto *Node = NodeStack.back();
    784     NodeStack.pop_back();
    785     auto NewNode = Node->shortCircuit();
    786     if (NewNode) {
    787       NodeStack.push_back(NewNode);
    788       NodeStack.push_back(Node);
    789       Splits[Node->getIndex()].push_back(NewNode);
    790     }
    791   }
    792 
    793   // Insert nodes in the right place
    794   NodeList NewList;
    795   NewList.reserve(Nodes.size());
    796   CfgUnorderedSet<SizeT> Inserted;
    797   for (auto *Node : Nodes) {
    798     if (Inserted.find(Node->getIndex()) != Inserted.end())
    799       continue; // already inserted
    800     NodeList Stack{Node};
    801     while (!Stack.empty()) {
    802       auto *Current = Stack.back();
    803       Stack.pop_back();
    804       Inserted.insert(Current->getIndex());
    805       NewList.push_back(Current);
    806       for (auto *Next : Splits[Current->getIndex()]) {
    807         Stack.push_back(Next);
    808       }
    809     }
    810   }
    811 
    812   SizeT NodeIndex = 0;
    813   for (auto *Node : NewList) {
    814     Node->resetIndex(NodeIndex++);
    815   }
    816   Nodes = NewList;
    817 }
    818 
    819 void Cfg::floatConstantCSE() {
    820   // Load multiple uses of a floating point constant (between two call
    821   // instructions or block start/end) into a variable before its first use.
    822   //   t1 = b + 1.0
    823   //   t2 = c + 1.0
    824   // Gets transformed to:
    825   //   t0 = 1.0
    826   //   t0_1 = t0
    827   //   t1 = b + t0_1
    828   //   t2 = c + t0_1
    829   // Call instructions reset the procedure, but use the same variable, just in
    830   // case it got a register. We are assuming floating point registers are not
    831   // callee-saved in general. For example, continuing from before:
    832   //   result = call <some function>
    833   //   t3 = d + 1.0
    834   // Gets transformed to:
    835   //   result = call <some function>
    836   //   t0_2 = t0
    837   //   t3 = d + t0_2
    838   // TODO(manasijm, stichnot): Figure out how to 'link' t0 to the stack slot of
    839   // 1.0. When t0 does not get a register, introducing an extra assignment
    840   // statement does not make sense. The relevant portion is marked below.
    841 
    842   TimerMarker _(TimerStack::TT_floatConstantCse, this);
    843   for (CfgNode *Node : getNodes()) {
    844 
    845     CfgUnorderedMap<Constant *, Variable *> ConstCache;
    846     auto Current = Node->getInsts().begin();
    847     auto End = Node->getInsts().end();
    848     while (Current != End) {
    849       CfgUnorderedMap<Constant *, CfgVector<InstList::iterator>> FloatUses;
    850       if (llvm::isa<InstCall>(iteratorToInst(Current))) {
    851         ++Current;
    852         assert(Current != End);
    853         // Block should not end with a call
    854       }
    855       while (Current != End && !llvm::isa<InstCall>(iteratorToInst(Current))) {
    856         if (!Current->isDeleted()) {
    857           for (SizeT i = 0; i < Current->getSrcSize(); ++i) {
    858             if (auto *Const = llvm::dyn_cast<Constant>(Current->getSrc(i))) {
    859               if (Const->getType() == IceType_f32 ||
    860                   Const->getType() == IceType_f64) {
    861                 FloatUses[Const].push_back(Current);
    862               }
    863             }
    864           }
    865         }
    866         ++Current;
    867       }
    868       for (auto &Pair : FloatUses) {
    869         static constexpr SizeT MinUseThreshold = 3;
    870         if (Pair.second.size() < MinUseThreshold)
    871           continue;
    872         // Only consider constants with at least `MinUseThreshold` uses
    873         auto &Insts = Node->getInsts();
    874 
    875         if (ConstCache.find(Pair.first) == ConstCache.end()) {
    876           // First sighting of a constant used at least MinUseThreshold times.
    877           auto *NewVar = makeVariable(Pair.first->getType());
    878           // NewVar->setLinkedTo(Pair.first);
    879           // TODO(manasijm): Plumbing for linking to an Operand.
    880           auto *Assign = InstAssign::create(Node->getCfg(), NewVar, Pair.first);
    881           Insts.insert(Pair.second[0], Assign);
    882           ConstCache[Pair.first] = NewVar;
    883         }
    884 
    885         auto *NewVar = makeVariable(Pair.first->getType());
    886         NewVar->setLinkedTo(ConstCache[Pair.first]);
    887         auto *Assign =
    888             InstAssign::create(Node->getCfg(), NewVar, ConstCache[Pair.first]);
    889 
    890         Insts.insert(Pair.second[0], Assign);
    891         for (auto InstUse : Pair.second) {
    892           for (SizeT i = 0; i < InstUse->getSrcSize(); ++i) {
    893             if (auto *Const = llvm::dyn_cast<Constant>(InstUse->getSrc(i))) {
    894               if (Const == Pair.first) {
    895                 InstUse->replaceSource(i, NewVar);
    896               }
    897             }
    898           }
    899         }
    900       }
    901     }
    902   }
    903 }
    904 
    905 void Cfg::doArgLowering() {
    906   TimerMarker T(TimerStack::TT_doArgLowering, this);
    907   getTarget()->lowerArguments();
    908 }
    909 
    910 void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
    911                                 uint32_t CombinedAlignment, InstList &Insts,
    912                                 AllocaBaseVariableType BaseVariableType) {
    913   if (Allocas.empty())
    914     return;
    915   // Sort by decreasing alignment.
    916   std::sort(Allocas.begin(), Allocas.end(), [](InstAlloca *A1, InstAlloca *A2) {
    917     uint32_t Align1 = A1->getAlignInBytes();
    918     uint32_t Align2 = A2->getAlignInBytes();
    919     if (Align1 == Align2)
    920       return A1->getNumber() < A2->getNumber();
    921     else
    922       return Align1 > Align2;
    923   });
    924   // Process the allocas in order of decreasing stack alignment.  This allows
    925   // us to pack less-aligned pieces after more-aligned ones, resulting in less
    926   // stack growth.  It also allows there to be at most one stack alignment "and"
    927   // instruction for a whole list of allocas.
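          // Worked example (hypothetical allocas, stack/user-pointer addressing,
          // ignoring the out-args adjustment below): requests of (size, align) =
          // (8, 16), (4, 4) and (1, 1) get rounded sizes 16, 4 and 1, offsets 0, 16
          // and 20, and a final CurrentOffset of 21; with CombinedAlignment = 16 the
          // fused region is rounded up to TotalSize = 32.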
    928   uint32_t CurrentOffset = 0;
    929   CfgVector<int32_t> Offsets;
    930   for (Inst *Instr : Allocas) {
    931     auto *Alloca = llvm::cast<InstAlloca>(Instr);
    932     // Adjust the size of the allocation up to the next multiple of the
    933     // object's alignment.
    934     uint32_t Alignment = std::max(Alloca->getAlignInBytes(), 1u);
    935     auto *ConstSize =
    936         llvm::dyn_cast<ConstantInteger32>(Alloca->getSizeInBytes());
    937     uint32_t Size = Utils::applyAlignment(ConstSize->getValue(), Alignment);
    938     if (BaseVariableType == BVT_FramePointer) {
    939       // Addressing is relative to the frame pointer.  Subtract the offset after
    940       // adding the size of the alloca, because it grows downwards from the
    941       // frame pointer.
    942       Offsets.push_back(Target->getFramePointerOffset(CurrentOffset, Size));
    943     } else {
    944       // Addressing is relative to the stack pointer or to a user pointer.  Add
    945       // the offset before adding the size of the object, because it grows
    946       // upwards from the stack pointer. In addition, if the addressing is
    947       // relative to the stack pointer, we need to add the pre-computed max out
    948       // args size bytes.
    949       const uint32_t OutArgsOffsetOrZero =
    950           (BaseVariableType == BVT_StackPointer)
    951               ? getTarget()->maxOutArgsSizeBytes()
    952               : 0;
    953       Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
    954     }
    955     // Update the running offset of the fused alloca region.
    956     CurrentOffset += Size;
    957   }
    958   // Round the offset up to the alignment granularity to use as the size.
    959   uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment);
    960   // Ensure every alloca was assigned an offset.
    961   assert(Allocas.size() == Offsets.size());
    962 
    963   switch (BaseVariableType) {
    964   case BVT_UserPointer: {
    965     Variable *BaseVariable = makeVariable(IceType_i32);
    966     for (SizeT i = 0; i < Allocas.size(); ++i) {
    967       auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
    968       // Emit a new addition operation to replace the alloca.
    969       Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]);
    970       InstArithmetic *Add =
    971           InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(),
    972                                  BaseVariable, AllocaOffset);
    973       Insts.push_front(Add);
    974       Alloca->setDeleted();
    975     }
    976     Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
    977     InstAlloca *CombinedAlloca =
    978         InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment);
    979     CombinedAlloca->setKnownFrameOffset();
    980     Insts.push_front(CombinedAlloca);
    981   } break;
    982   case BVT_StackPointer:
    983   case BVT_FramePointer: {
    984     for (SizeT i = 0; i < Allocas.size(); ++i) {
    985       auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
    986       // Emit a fake definition of the rematerializable variable.
    987       Variable *Dest = Alloca->getDest();
    988       auto *Def = InstFakeDef::create(this, Dest);
    989       if (BaseVariableType == BVT_StackPointer)
    990         Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
    991       else
    992         Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]);
    993       Insts.push_front(Def);
    994       Alloca->setDeleted();
    995     }
    996     // Allocate the fixed area in the function prolog.
    997     getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment);
    998   } break;
    999   }
   1000 }
   1001 
   1002 void Cfg::processAllocas(bool SortAndCombine) {
   1003   TimerMarker _(TimerStack::TT_alloca, this);
   1004   const uint32_t StackAlignment = getTarget()->getStackAlignment();
   1005   CfgNode *EntryNode = getEntryNode();
   1006   assert(EntryNode);
   1007   // LLVM enforces power of 2 alignment.
   1008   assert(llvm::isPowerOf2_32(StackAlignment));
   1009   // If the ABI's stack alignment is smaller than the vector size (16 bytes),
   1010   // conservatively use a frame pointer to allow for explicit alignment of the
   1011   // stack pointer. This needs to happen before register allocation so the frame
   1012   // pointer can be reserved.
   1013   if (getTarget()->needsStackPointerAlignment()) {
   1014     getTarget()->setHasFramePointer();
   1015   }
   1016   // Determine if there are large alignment allocations in the entry block or
   1017   // dynamic allocations (variable size in the entry block).
   1018   bool HasLargeAlignment = false;
   1019   bool HasDynamicAllocation = false;
   1020   for (Inst &Instr : EntryNode->getInsts()) {
   1021     if (Instr.isDeleted())
   1022       continue;
   1023     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
   1024       uint32_t AlignmentParam = Alloca->getAlignInBytes();
   1025       if (AlignmentParam > StackAlignment)
   1026         HasLargeAlignment = true;
   1027       if (llvm::isa<Constant>(Alloca->getSizeInBytes()))
   1028         Alloca->setKnownFrameOffset();
   1029       else {
   1030         HasDynamicAllocation = true;
   1031         // If Allocas are not sorted, the first dynamic allocation causes
   1032         // later Allocas to be at unknown offsets relative to the stack/frame.
   1033         if (!SortAndCombine)
   1034           break;
   1035       }
   1036     }
   1037   }
   1038   // Don't do the heavyweight sorting and layout for low optimization levels.
   1039   if (!SortAndCombine)
   1040     return;
   1041   // Any alloca outside the entry block is a dynamic allocation.
   1042   for (CfgNode *Node : Nodes) {
   1043     if (Node == EntryNode)
   1044       continue;
   1045     for (Inst &Instr : Node->getInsts()) {
   1046       if (Instr.isDeleted())
   1047         continue;
   1048       if (llvm::isa<InstAlloca>(&Instr)) {
   1049         // Allocations outside the entry block require a frame pointer.
   1050         HasDynamicAllocation = true;
   1051         break;
   1052       }
   1053     }
   1054     if (HasDynamicAllocation)
   1055       break;
   1056   }
   1057   // Mark the target as requiring a frame pointer.
   1058   if (HasLargeAlignment || HasDynamicAllocation)
   1059     getTarget()->setHasFramePointer();
   1060   // Collect the Allocas into the two vectors.
   1061   // Allocas in the entry block that have constant size and alignment less
   1062   // than or equal to the function's stack alignment.
   1063   CfgVector<InstAlloca *> FixedAllocas;
   1064   // Allocas in the entry block that have constant size and alignment greater
   1065   // than the function's stack alignment.
   1066   CfgVector<InstAlloca *> AlignedAllocas;
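          // For example (hypothetical values, StackAlignment = 16): a constant-size
          // alloca with align 8 always lands in FixedAllocas, while one with align 32
          // goes to AlignedAllocas only when the function also has dynamic
          // allocations; otherwise it stays in FixedAllocas and just raises
          // MaxAlignment.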
   1067   // Maximum alignment used by any alloca.
   1068   uint32_t MaxAlignment = StackAlignment;
   1069   for (Inst &Instr : EntryNode->getInsts()) {
   1070     if (Instr.isDeleted())
   1071       continue;
   1072     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
   1073       if (!llvm::isa<Constant>(Alloca->getSizeInBytes()))
   1074         continue;
   1075       uint32_t AlignmentParam = Alloca->getAlignInBytes();
   1076       // For default align=0, set it to the real value 1, to avoid any
   1077       // bit-manipulation problems below.
   1078       AlignmentParam = std::max(AlignmentParam, 1u);
   1079       assert(llvm::isPowerOf2_32(AlignmentParam));
   1080       if (HasDynamicAllocation && AlignmentParam > StackAlignment) {
   1081         // If we have both dynamic allocations and large stack alignments,
   1082         // high-alignment allocations are pulled out with their own base.
   1083         AlignedAllocas.push_back(Alloca);
   1084       } else {
   1085         FixedAllocas.push_back(Alloca);
   1086       }
   1087       MaxAlignment = std::max(AlignmentParam, MaxAlignment);
   1088     }
   1089   }
   1090   // Add instructions to the head of the entry block in reverse order.
   1091   InstList &Insts = getEntryNode()->getInsts();
   1092   if (HasDynamicAllocation && HasLargeAlignment) {
   1093     // We are using a frame pointer, but fixed large-alignment alloca addresses
   1094     // do not have a known offset from either the stack or frame pointer.
   1095     // Instead, they grow up from the user pointer produced by a combined alloca.
   1096     sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer);
   1097     // Fixed size allocas are addressed relative to the frame pointer.
   1098     sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts,
   1099                           BVT_FramePointer);
   1100   } else {
   1101     // Otherwise, fixed size allocas are addressed relative to the stack unless
   1102     // there are dynamic allocas.
   1103     const AllocaBaseVariableType BasePointerType =
   1104         (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer);
   1105     sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
   1106   }
   1107   if (!FixedAllocas.empty() || !AlignedAllocas.empty())
   1108     // No use calling findRematerializable() unless there is some
   1109     // rematerializable alloca instruction to seed it.
   1110     findRematerializable();
   1111 }
   1112 
   1113 namespace {
   1114 
   1115 // Helpers for findRematerializable().  For each of them, if a suitable
   1116 // rematerialization is found, the instruction's Dest variable is set to be
   1117 // rematerializable and it returns true, otherwise it returns false.
   1118 
   1119 bool rematerializeArithmetic(const Inst *Instr) {
   1120   // Check that it's an Arithmetic instruction with an Add operation.
   1121   auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr);
   1122   if (Arith == nullptr || Arith->getOp() != InstArithmetic::Add)
   1123     return false;
   1124   // Check that Src(0) is rematerializable.
   1125   auto *Src0Var = llvm::dyn_cast<Variable>(Arith->getSrc(0));
   1126   if (Src0Var == nullptr || !Src0Var->isRematerializable())
   1127     return false;
   1128   // Check that Src(1) is an immediate.
   1129   auto *Src1Imm = llvm::dyn_cast<ConstantInteger32>(Arith->getSrc(1));
   1130   if (Src1Imm == nullptr)
   1131     return false;
   1132   Arith->getDest()->setRematerializable(
   1133       Src0Var->getRegNum(), Src0Var->getStackOffset() + Src1Imm->getValue());
   1134   return true;
   1135 }
   1136 
   1137 bool rematerializeAssign(const Inst *Instr) {
   1138   // An InstAssign only originates from an inttoptr or ptrtoint instruction,
   1139   // which never occurs in a MINIMAL build.
   1140   if (BuildDefs::minimal())
   1141     return false;
   1142   // Check that it's an Assign instruction.
   1143   if (!llvm::isa<InstAssign>(Instr))
   1144     return false;
   1145   // Check that Src(0) is rematerializable.
   1146   auto *Src0Var = llvm::dyn_cast<Variable>(Instr->getSrc(0));
   1147   if (Src0Var == nullptr || !Src0Var->isRematerializable())
   1148     return false;
   1149   Instr->getDest()->setRematerializable(Src0Var->getRegNum(),
   1150                                         Src0Var->getStackOffset());
   1151   return true;
   1152 }
   1153 
   1154 bool rematerializeCast(const Inst *Instr) {
   1155 // A pointer-type bitcast never occurs in a MINIMAL build.
   1156   if (BuildDefs::minimal())
   1157     return false;
   1158   // Check that it's a Cast instruction with a Bitcast operation.
   1159   auto *Cast = llvm::dyn_cast<InstCast>(Instr);
   1160   if (Cast == nullptr || Cast->getCastKind() != InstCast::Bitcast)
   1161     return false;
   1162   // Check that Src(0) is rematerializable.
   1163   auto *Src0Var = llvm::dyn_cast<Variable>(Cast->getSrc(0));
   1164   if (Src0Var == nullptr || !Src0Var->isRematerializable())
   1165     return false;
   1166   // Check that Dest and Src(0) have the same type.
   1167   Variable *Dest = Cast->getDest();
   1168   if (Dest->getType() != Src0Var->getType())
   1169     return false;
   1170   Dest->setRematerializable(Src0Var->getRegNum(), Src0Var->getStackOffset());
   1171   return true;
   1172 }
   1173 
   1174 } // end of anonymous namespace
   1175 
   1176 /// Scan the function to find additional rematerializable variables.  This is
   1177 /// possible when the source operand of an InstAssignment is a rematerializable
   1178 /// variable, or the same for a pointer-type InstCast::Bitcast, or when an
   1179 /// InstArithmetic is an add of a rematerializable variable and an immediate.
   1180 /// Note that InstAssignment instructions and pointer-type InstCast::Bitcast
   1181 /// instructions generally only come about from the IceConverter's treatment of
   1182 /// inttoptr, ptrtoint, and bitcast instructions.  TODO(stichnot): Consider
   1183 /// other possibilities, however unlikely, such as InstArithmetic::Sub, or
   1184 /// commutativity.
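        /// For illustration (hypothetical variables): if %base is already
        /// rematerializable as [frame register + 16], then after
        ///   %p = add %base, 8
        ///   %q = %p            ; assignment originating from inttoptr/ptrtoint
        /// both %p and %q become rematerializable as [frame register + 24] (the %q
        /// case applies only in non-MINIMAL builds), and no stack slot or register
        /// needs to be dedicated to either of them.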
   1185 void Cfg::findRematerializable() {
   1186   // Scan the instructions in order, and repeat until no new opportunities are
   1187   // found.  It may take more than one iteration because a variable's defining
   1188   // block may happen to come after a block where it is used, depending on the
   1189   // CfgNode linearization order.
   1190   bool FoundNewAssignment;
   1191   do {
   1192     FoundNewAssignment = false;
   1193     for (CfgNode *Node : getNodes()) {
   1194       // No need to process Phi instructions.
   1195       for (Inst &Instr : Node->getInsts()) {
   1196         if (Instr.isDeleted())
   1197           continue;
   1198         Variable *Dest = Instr.getDest();
   1199         if (Dest == nullptr || Dest->isRematerializable())
   1200           continue;
   1201         if (rematerializeArithmetic(&Instr) || rematerializeAssign(&Instr) ||
   1202             rematerializeCast(&Instr)) {
   1203           FoundNewAssignment = true;
   1204         }
   1205       }
   1206     }
   1207   } while (FoundNewAssignment);
   1208 }
   1209 
   1210 void Cfg::doAddressOpt() {
   1211   TimerMarker T(TimerStack::TT_doAddressOpt, this);
   1212   for (CfgNode *Node : Nodes)
   1213     Node->doAddressOpt();
   1214 }
   1215 
   1216 namespace {
   1217 // ShuffleVectorUtils implements helper functions for rematerializing
   1218 // shufflevector instructions from a sequence of extractelement/insertelement
   1219 // instructions. It looks for the following pattern:
   1220 //
   1221 // %t0 = extractelement A, %n0
   1222 // %t1 = extractelement B, %n1
   1223 // %t2 = extractelement C, %n2
   1224 // ...
   1225 // %tN = extractelement N, %nN
   1226 // %d0 = insertelement undef, %t0, 0
   1227 // %d1 = insertelement %d0, %t1, 1
   1228 // %d2 = insertelement %d1, %t2, 2
   1229 // ...
   1230 // %dest = insertelement %d_N-1, %tN, N
   1231 //
   1232 // where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two
   1233 // distinct variables.
   1234 namespace ShuffleVectorUtils {
   1235 // findAllInserts is used when searching for all the insertelements that are
   1236 // used in a shufflevector operation. This function works recursively, when
   1237 // invoked with I = i, the function assumes Insts[i] is the last found
   1238 // insertelement in the chain. The next insertelement insertruction is saved in
   1239 // insertelement in the chain. The next insertelement instruction is saved in
   1240 bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
   1241                     CfgVector<const Inst *> *Insts, SizeT I = 0) {
   1242   const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);
   1243 
   1244   if (I > Insts->size()) {
   1245     if (Verbose) {
   1246       Ctx->getStrDump() << "\tToo many inserts.\n";
   1247     }
   1248     return false;
   1249   }
   1250 
   1251   const auto *LastInsert = Insts->at(I);
   1252   assert(llvm::isa<InstInsertElement>(LastInsert));
   1253 
   1254   if (I == Insts->size() - 1) {
   1255     // Matching against undef is not really needed because the value in Src(0)
   1256     // will be totally overwritten. We still enforce it anyway because the
   1257     // PNaCl toolchain generates the bitcode with it.
   1258     if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
   1259       if (Verbose) {
   1260         Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
   1261                           << Insts->size();
   1262         LastInsert->dump(Func);
   1263         Ctx->getStrDump() << "\n";
   1264       }
   1265       return false;
   1266     }
   1267 
   1268     // The following loop ensures that the insertelements are sorted. In theory,
   1269     // we could relax this restriction and allow any order. As long as each
   1270     // index appears exactly once, this chain is still a candidate for becoming
   1271     // a shufflevector. The Insts vector is traversed backwards because the
   1272     // instructions are "enqueued" in reverse order.
   1273     int32_t ExpectedElement = 0;
   1274     for (const auto *I : reverse_range(*Insts)) {
   1275       if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() !=
   1276           ExpectedElement) {
   1277         return false;
   1278       }
   1279       ++ExpectedElement;
   1280     }
   1281     return true;
   1282   }
   1283 
   1284   const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0));
   1285   const auto *Def = VM->getSingleDefinition(Src0V);
   1286 
   1287   // Only optimize if the first operand in
   1288   //
   1289   //   Dest = insertelement A, B, 10
   1290   //
   1291   // is singly-def'ed.
   1292   if (Def == nullptr) {
   1293     if (Verbose) {
   1294       Ctx->getStrDump() << "\tmulti-def: ";
   1295       (*Insts)[I]->dump(Func);
   1296       Ctx->getStrDump() << "\n";
   1297     }
   1298     return false;
   1299   }
   1300 
   1301   // We also require the (single) definition to come from an insertelement
   1302   // instruction.
   1303   if (!llvm::isa<InstInsertElement>(Def)) {
   1304     if (Verbose) {
   1305       Ctx->getStrDump() << "\tnot insert element: ";
   1306       Def->dump(Func);
   1307       Ctx->getStrDump() << "\n";
   1308     }
   1309     return false;
   1310   }
   1311 
   1312   // Everything seems fine, so we save Def in Insts, and delegate the decision
   1313   // to findAllInserts.
   1314   (*Insts)[I + 1] = Def;
   1315 
   1316   return findAllInserts(Func, Ctx, VM, Insts, I + 1);
   1317 }
   1318 
   1319 // insertsLastElement returns true if Insert is inserting an element in the last
   1320 // position of a vector.
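// For example (illustrative), for a v4i32 destination this returns true only
// when the constant index operand (Src(2)) is 3.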
   1321 bool insertsLastElement(const Inst &Insert) {
   1322   const Type DestTy = Insert.getDest()->getType();
   1323   assert(isVectorType(DestTy));
   1324   const SizeT Elem =
   1325       llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
   1326   return Elem == typeNumElements(DestTy) - 1;
   1327 }
   1328 
   1329 // findAllExtracts goes over all the insertelement instructions that are
   1330 // candidates to be replaced by a shufflevector, and searches for all the
   1331 // definitions of the elements being inserted. If all of the elements are the
   1332 // result of an extractelement instruction, and all of the extractelements
    1333 // operate on at most two different sources, then the instructions can be
   1334 // replaced by a shufflevector.
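// Matching sketch (hypothetical operands): for each recorded insertelement,
// Src(1) must be the unique result of an extractelement; the first extract
// source seen becomes *Src0, the first different one becomes *Src1, and any
// third distinct source aborts the rewrite.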
   1335 bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
   1336                      const CfgVector<const Inst *> &Insts, Variable **Src0,
   1337                      Variable **Src1, CfgVector<const Inst *> *Extracts) {
   1338   const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);
   1339 
   1340   *Src0 = nullptr;
   1341   *Src1 = nullptr;
   1342   assert(Insts.size() > 0);
   1343   for (SizeT I = 0; I < Insts.size(); ++I) {
   1344     const auto *Insert = Insts.at(I);
   1345     const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
   1346     if (Src1V == nullptr) {
   1347       if (Verbose) {
   1348         Ctx->getStrDump() << "src(1) is not a variable: ";
   1349         Insert->dump(Func);
   1350         Ctx->getStrDump() << "\n";
   1351       }
   1352       return false;
   1353     }
   1354 
   1355     const auto *Def = VM->getSingleDefinition(Src1V);
   1356     if (Def == nullptr) {
   1357       if (Verbose) {
   1358         Ctx->getStrDump() << "multi-def src(1): ";
   1359         Insert->dump(Func);
   1360         Ctx->getStrDump() << "\n";
   1361       }
   1362       return false;
   1363     }
   1364 
   1365     if (!llvm::isa<InstExtractElement>(Def)) {
   1366       if (Verbose) {
   1367         Ctx->getStrDump() << "not extractelement: ";
   1368         Def->dump(Func);
   1369         Ctx->getStrDump() << "\n";
   1370       }
   1371       return false;
   1372     }
   1373 
   1374     auto *Src = llvm::cast<Variable>(Def->getSrc(0));
   1375     if (*Src0 == nullptr) {
   1376       // No sources yet. Save Src to Src0.
   1377       *Src0 = Src;
   1378     } else if (*Src1 == nullptr) {
   1379       // We already have a source, so we might save Src in Src1 -- but only if
   1380       // Src0 is not Src.
   1381       if (*Src0 != Src) {
   1382         *Src1 = Src;
   1383       }
   1384     } else if (Src != *Src0 && Src != *Src1) {
   1385       // More than two sources, so we can't rematerialize the shufflevector
   1386       // instruction.
   1387       if (Verbose) {
   1388         Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
   1389       }
   1390       return false;
   1391     }
   1392 
   1393     (*Extracts)[I] = Def;
   1394   }
   1395 
   1396   // We should have seen at least one source operand.
   1397   assert(*Src0 != nullptr);
   1398 
   1399   // If a second source was not seen, then we just make Src1 = Src0 to simplify
    1400   // things downstream. This should not matter, as all of the indexes in the
   1401   // shufflevector instruction will point to Src0.
   1402   if (*Src1 == nullptr) {
   1403     *Src1 = *Src0;
   1404   }
   1405 
   1406   return true;
   1407 }
   1408 
   1409 } // end of namespace ShuffleVectorUtils
   1410 } // end of anonymous namespace
   1411 
   1412 void Cfg::materializeVectorShuffles() {
   1413   const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);
   1414 
   1415   std::unique_ptr<OstreamLocker> L;
   1416   if (Verbose) {
   1417     L.reset(new OstreamLocker(getContext()));
   1418     getContext()->getStrDump() << "\nShuffle materialization:\n";
   1419   }
   1420 
   1421   // MaxVectorElements is the maximum number of elements in the vector types
   1422   // handled by Subzero. We use it to create the Inserts and Extracts vectors
    1423   // with the maximum needed size, so the later resize() calls never reallocate.
   1424   const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
   1425   CfgVector<const Inst *> Inserts(MaxVectorElements);
   1426   CfgVector<const Inst *> Extracts(MaxVectorElements);
   1427 
   1428   TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
   1429   for (CfgNode *Node : Nodes) {
   1430     for (auto &Instr : Node->getInsts()) {
   1431       if (!llvm::isa<InstInsertElement>(Instr)) {
   1432         continue;
   1433       }
   1434       if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
   1435         // To avoid wasting time, we only start the pattern match at the last
   1436         // insertelement instruction -- and go backwards from there.
   1437         continue;
   1438       }
   1439       if (Verbose) {
   1440         getContext()->getStrDump() << "\tCandidate: ";
   1441         Instr.dump(this);
   1442         getContext()->getStrDump() << "\n";
   1443       }
   1444       Inserts.resize(typeNumElements(Instr.getDest()->getType()));
   1445       Inserts[0] = &Instr;
   1446       if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
   1447                                               VMetadata.get(), &Inserts)) {
   1448         // If we fail to find a sequence of insertelements, we stop the
   1449         // optimization.
   1450         if (Verbose) {
   1451           getContext()->getStrDump() << "\tFalse alarm.\n";
   1452         }
   1453         continue;
   1454       }
   1455       if (Verbose) {
    1456         getContext()->getStrDump() << "\tFound the following insertelements:\n";
   1457         for (auto *I : reverse_range(Inserts)) {
   1458           getContext()->getStrDump() << "\t\t";
   1459           I->dump(this);
   1460           getContext()->getStrDump() << "\n";
   1461         }
   1462       }
   1463       Extracts.resize(Inserts.size());
   1464       Variable *Src0;
   1465       Variable *Src1;
   1466       if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
   1467                                                VMetadata.get(), Inserts, &Src0,
   1468                                                &Src1, &Extracts)) {
   1469         // If we fail to match the definitions of the insertelements' sources
   1470         // with extractelement instructions -- or if those instructions operate
   1471         // on more than two different variables -- we stop the optimization.
   1472         if (Verbose) {
   1473           getContext()->getStrDump() << "\tFailed to match extractelements.\n";
   1474         }
   1475         continue;
   1476       }
   1477       if (Verbose) {
   1478         getContext()->getStrDump()
   1479             << "\tFound the following insert/extract element pairs: \n";
   1480         for (SizeT I = 0; I < Inserts.size(); ++I) {
   1481           const SizeT Pos = Inserts.size() - I - 1;
   1482           getContext()->getStrDump() << "\t\tInsert : ";
   1483           Inserts[Pos]->dump(this);
   1484           getContext()->getStrDump() << "\n\t\tExtract: ";
   1485           Extracts[Pos]->dump(this);
   1486           getContext()->getStrDump() << "\n";
   1487         }
   1488       }
   1489 
   1490       assert(Src0 != nullptr);
   1491       assert(Src1 != nullptr);
   1492 
   1493       auto *ShuffleVector =
   1494           InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
   1495       assert(ShuffleVector->getSrc(0) == Src0);
   1496       assert(ShuffleVector->getSrc(1) == Src1);
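      // Indexes into Src1 are remapped past the end of Src0, following the
      // usual shufflevector convention.  For example (illustrative), with v4
      // vectors, "extractelement Src1, 2" contributes shuffle index 2 + 4 = 6.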
   1497       for (SizeT I = 0; I < Extracts.size(); ++I) {
   1498         const SizeT Pos = Extracts.size() - I - 1;
   1499         auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
   1500         if (Src0 == Extracts[Pos]->getSrc(0)) {
   1501           ShuffleVector->addIndex(Index);
   1502         } else {
   1503           ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
   1504               Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
   1505         }
   1506       }
   1507 
   1508       if (Verbose) {
   1509         getContext()->getStrDump() << "Created: ";
   1510         ShuffleVector->dump(this);
   1511         getContext()->getStrDump() << "\n";
   1512       }
   1513 
   1514       Instr.setDeleted();
   1515       auto &LoweringContext = getTarget()->getContext();
   1516       LoweringContext.setInsertPoint(instToIterator(&Instr));
   1517       LoweringContext.insert(ShuffleVector);
   1518     }
   1519   }
   1520 }
   1521 
   1522 void Cfg::doNopInsertion() {
   1523   if (!getFlags().getShouldDoNopInsertion())
   1524     return;
   1525   TimerMarker T(TimerStack::TT_doNopInsertion, this);
   1526   RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_NopInsertion,
   1527                             SequenceNumber);
   1528   for (CfgNode *Node : Nodes)
   1529     Node->doNopInsertion(RNG);
   1530 }
   1531 
   1532 void Cfg::genCode() {
   1533   TimerMarker T(TimerStack::TT_genCode, this);
   1534   for (CfgNode *Node : Nodes)
   1535     Node->genCode();
   1536 }
   1537 
   1538 // Compute the stack frame layout.
   1539 void Cfg::genFrame() {
   1540   TimerMarker T(TimerStack::TT_genFrame, this);
   1541   getTarget()->addProlog(Entry);
   1542   for (CfgNode *Node : Nodes)
   1543     if (Node->getHasReturn())
   1544       getTarget()->addEpilog(Node);
   1545 }
   1546 
   1547 void Cfg::generateLoopInfo() {
   1548   TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
   1549   LoopInfo = ComputeLoopInfo(this);
   1550 }
   1551 
   1552 // This is a lightweight version of live-range-end calculation. Marks the last
    1553 // use of only those variables whose definition and uses are completely within a
   1554 // single block. It is a quick single pass and doesn't need to iterate until
   1555 // convergence.
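// For instance (illustrative), a temporary that is defined and used only
// inside one block gets its final use flagged here, so later phases can see
// where its value dies without running the full iterative analysis.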
   1556 void Cfg::livenessLightweight() {
   1557   TimerMarker T(TimerStack::TT_livenessLightweight, this);
   1558   getVMetadata()->init(VMK_Uses);
   1559   for (CfgNode *Node : Nodes)
   1560     Node->livenessLightweight();
   1561 }
   1562 
   1563 void Cfg::liveness(LivenessMode Mode) {
   1564   TimerMarker T(TimerStack::TT_liveness, this);
   1565   // Destroying the previous (if any) Liveness information clears the Liveness
   1566   // allocator TLS pointer.
   1567   Live = nullptr;
   1568   Live = Liveness::create(this, Mode);
   1569 
   1570   getVMetadata()->init(VMK_Uses);
   1571   Live->init();
   1572 
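  // This is the standard backward dataflow fixed point (sketch; the set names
  // are illustrative):
  //   LiveOut[B] = union of LiveIn[S] over all successors S of B
  //   LiveIn[B]  = (LiveOut[B] - Defs[B]) | Uses[B]
  // Each CfgNode::liveness() call recomputes these for one node and reports
  // whether LiveIn changed.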
   1573   // Initialize with all nodes needing to be processed.
   1574   BitVector NeedToProcess(Nodes.size(), true);
   1575   while (NeedToProcess.any()) {
   1576     // Iterate in reverse topological order to speed up convergence.
   1577     for (CfgNode *Node : reverse_range(Nodes)) {
   1578       if (NeedToProcess[Node->getIndex()]) {
   1579         NeedToProcess[Node->getIndex()] = false;
   1580         bool Changed = Node->liveness(getLiveness());
   1581         if (Changed) {
   1582           // If the beginning-of-block liveness changed since the last
   1583           // iteration, mark all in-edges as needing to be processed.
   1584           for (CfgNode *Pred : Node->getInEdges())
   1585             NeedToProcess[Pred->getIndex()] = true;
   1586         }
   1587       }
   1588     }
   1589   }
   1590   if (Mode == Liveness_Intervals) {
   1591     // Reset each variable's live range.
   1592     for (Variable *Var : Variables)
   1593       Var->resetLiveRange();
   1594   }
   1595   // Make a final pass over each node to delete dead instructions, collect the
   1596   // first and last instruction numbers, and add live range segments for that
   1597   // node.
   1598   for (CfgNode *Node : Nodes) {
   1599     InstNumberT FirstInstNum = Inst::NumberSentinel;
   1600     InstNumberT LastInstNum = Inst::NumberSentinel;
   1601     for (Inst &I : Node->getPhis()) {
   1602       I.deleteIfDead();
   1603       if (Mode == Liveness_Intervals && !I.isDeleted()) {
   1604         if (FirstInstNum == Inst::NumberSentinel)
   1605           FirstInstNum = I.getNumber();
   1606         assert(I.getNumber() > LastInstNum);
   1607         LastInstNum = I.getNumber();
   1608       }
   1609     }
   1610     for (Inst &I : Node->getInsts()) {
   1611       I.deleteIfDead();
   1612       if (Mode == Liveness_Intervals && !I.isDeleted()) {
   1613         if (FirstInstNum == Inst::NumberSentinel)
   1614           FirstInstNum = I.getNumber();
   1615         assert(I.getNumber() > LastInstNum);
   1616         LastInstNum = I.getNumber();
   1617       }
   1618     }
   1619     if (Mode == Liveness_Intervals) {
   1620       // Special treatment for live in-args. Their liveness needs to extend
   1621       // beyond the beginning of the function, otherwise an arg whose only use
   1622       // is in the first instruction will end up having the trivial live range
   1623       // [2,2) and will *not* interfere with other arguments. So if the first
   1624       // instruction of the method is "r=arg1+arg2", both args may be assigned
   1625       // the same register. This is accomplished by extending the entry block's
   1626       // instruction range from [2,n) to [1,n) which will transform the
   1627       // problematic [2,2) live ranges into [1,2).  This extension works because
   1628       // the entry node is guaranteed to have the lowest instruction numbers.
   1629       if (Node == getEntryNode()) {
   1630         FirstInstNum = Inst::NumberExtended;
   1631         // Just in case the entry node somehow contains no instructions...
   1632         if (LastInstNum == Inst::NumberSentinel)
   1633           LastInstNum = FirstInstNum;
   1634       }
   1635       // If this node somehow contains no instructions, don't bother trying to
   1636       // add liveness intervals for it, because variables that are live-in and
   1637       // live-out will have a bogus interval added.
   1638       if (FirstInstNum != Inst::NumberSentinel)
   1639         Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
   1640     }
   1641   }
   1642 }
   1643 
   1644 // Traverse every Variable of every Inst and verify that it appears within the
   1645 // Variable's computed live range.
   1646 bool Cfg::validateLiveness() const {
   1647   TimerMarker T(TimerStack::TT_validateLiveness, this);
   1648   bool Valid = true;
   1649   OstreamLocker L(Ctx);
   1650   Ostream &Str = Ctx->getStrDump();
   1651   for (CfgNode *Node : Nodes) {
   1652     Inst *FirstInst = nullptr;
   1653     for (Inst &Instr : Node->getInsts()) {
   1654       if (Instr.isDeleted())
   1655         continue;
   1656       if (FirstInst == nullptr)
   1657         FirstInst = &Instr;
   1658       InstNumberT InstNumber = Instr.getNumber();
   1659       if (Variable *Dest = Instr.getDest()) {
   1660         if (!Dest->getIgnoreLiveness()) {
   1661           bool Invalid = false;
   1662           constexpr bool IsDest = true;
   1663           if (!Dest->getLiveRange().containsValue(InstNumber, IsDest))
   1664             Invalid = true;
   1665           // Check that this instruction actually *begins* Dest's live range,
   1666           // by checking that Dest is not live in the previous instruction. As
   1667           // a special exception, we don't check this for the first instruction
   1668           // of the block, because a Phi temporary may be live at the end of
   1669           // the previous block, and if it is also assigned in the first
   1670           // instruction of this block, the adjacent live ranges get merged.
   1671           if (&Instr != FirstInst && !Instr.isDestRedefined() &&
   1672               Dest->getLiveRange().containsValue(InstNumber - 1, IsDest))
   1673             Invalid = true;
   1674           if (Invalid) {
   1675             Valid = false;
   1676             Str << "Liveness error: inst " << Instr.getNumber() << " dest ";
   1677             Dest->dump(this);
   1678             Str << " live range " << Dest->getLiveRange() << "\n";
   1679           }
   1680         }
   1681       }
   1682       FOREACH_VAR_IN_INST(Var, Instr) {
   1683         static constexpr bool IsDest = false;
   1684         if (!Var->getIgnoreLiveness() &&
   1685             !Var->getLiveRange().containsValue(InstNumber, IsDest)) {
   1686           Valid = false;
   1687           Str << "Liveness error: inst " << Instr.getNumber() << " var ";
   1688           Var->dump(this);
   1689           Str << " live range " << Var->getLiveRange() << "\n";
   1690         }
   1691       }
   1692     }
   1693   }
   1694   return Valid;
   1695 }
   1696 
   1697 void Cfg::contractEmptyNodes() {
   1698   // If we're decorating the asm output with register liveness info, this
   1699   // information may become corrupted or incorrect after contracting nodes that
   1700   // contain only redundant assignments. As such, we disable this pass when
   1701   // DecorateAsm is specified. This may make the resulting code look more
   1702   // branchy, but it should have no effect on the register assignments.
   1703   if (getFlags().getDecorateAsm())
   1704     return;
   1705   for (CfgNode *Node : Nodes) {
   1706     Node->contractIfEmpty();
   1707   }
   1708 }
   1709 
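// Branch optimization is done per node: each node is handed its layout
// successor (nullptr for the last node) so the target lowering can elide or
// simplify branches that would simply fall through.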
   1710 void Cfg::doBranchOpt() {
   1711   TimerMarker T(TimerStack::TT_doBranchOpt, this);
   1712   for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
   1713     auto NextNode = I + 1;
   1714     (*I)->doBranchOpt(NextNode == E ? nullptr : *NextNode);
   1715   }
   1716 }
   1717 
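// Jump-table targets can be reached through an indirect branch, so under
// sandboxing they must start on a bundle boundary.  Mark them here; emit() and
// emitIAS() below check needsAlignment() and align accordingly.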
   1718 void Cfg::markNodesForSandboxing() {
   1719   for (const InstJumpTable *JT : JumpTables)
   1720     for (SizeT I = 0; I < JT->getNumTargets(); ++I)
   1721       JT->getTarget(I)->setNeedsAlignment();
   1722 }
   1723 
   1724 // ======================== Dump routines ======================== //
   1725 
   1726 // emitTextHeader() is not target-specific (apart from what is abstracted by
   1727 // the Assembler), so it is defined here rather than in the target lowering
   1728 // class.
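// Roughly, for an externally visible function "foo" with function sections
// enabled, the output looks like the following (illustrative; the exact align
// directive, alignment value, and padding bytes come from the target
// Assembler):
//
//   .text
//   .section .text.foo,"ax",%progbits
//   .globl foo
//   .type foo,%function
//   .p2align 5,0x<padding>
// foo: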
   1729 void Cfg::emitTextHeader(GlobalString Name, GlobalContext *Ctx,
   1730                          const Assembler *Asm) {
   1731   if (!BuildDefs::dump())
   1732     return;
   1733   Ostream &Str = Ctx->getStrEmit();
   1734   Str << "\t.text\n";
   1735   if (getFlags().getFunctionSections())
   1736     Str << "\t.section\t.text." << Name << ",\"ax\",%progbits\n";
   1737   if (!Asm->getInternal() || getFlags().getDisableInternal()) {
   1738     Str << "\t.globl\t" << Name << "\n";
   1739     Str << "\t.type\t" << Name << ",%function\n";
   1740   }
   1741   Str << "\t" << Asm->getAlignDirective() << " "
   1742       << Asm->getBundleAlignLog2Bytes() << ",0x";
   1743   for (uint8_t I : Asm->getNonExecBundlePadding())
   1744     Str.write_hex(I);
   1745   Str << "\n";
   1746   Str << Name << ":\n";
   1747 }
   1748 
   1749 void Cfg::emitJumpTables() {
   1750   switch (getFlags().getOutFileType()) {
   1751   case FT_Elf:
   1752   case FT_Iasm: {
    1753     // The emission needs to be delayed until after the text section, so
   1754     // save the offsets in the global context.
   1755     for (const InstJumpTable *JumpTable : JumpTables) {
   1756       Ctx->addJumpTableData(JumpTable->toJumpTableData(getAssembler()));
   1757     }
   1758   } break;
   1759   case FT_Asm: {
    1760     // Emit the assembly directly so we don't need to hang on to all the names.
   1761     for (const InstJumpTable *JumpTable : JumpTables)
   1762       getTarget()->emitJumpTable(this, JumpTable);
   1763   } break;
   1764   }
   1765 }
   1766 
   1767 void Cfg::emit() {
   1768   if (!BuildDefs::dump())
   1769     return;
   1770   TimerMarker T(TimerStack::TT_emitAsm, this);
   1771   if (getFlags().getDecorateAsm()) {
   1772     renumberInstructions();
   1773     getVMetadata()->init(VMK_Uses);
   1774     liveness(Liveness_Basic);
   1775     dump("After recomputing liveness for -decorate-asm");
   1776   }
   1777   OstreamLocker L(Ctx);
   1778   Ostream &Str = Ctx->getStrEmit();
   1779   const Assembler *Asm = getAssembler<>();
   1780   const bool NeedSandboxing = getFlags().getUseSandboxing();
   1781 
   1782   emitTextHeader(FunctionName, Ctx, Asm);
   1783   if (getFlags().getDecorateAsm()) {
   1784     for (Variable *Var : getVariables()) {
   1785       if (Var->hasKnownStackOffset() && !Var->isRematerializable()) {
   1786         Str << "\t" << Var->getSymbolicStackOffset() << " = "
   1787             << Var->getStackOffset() << "\n";
   1788       }
   1789     }
   1790   }
   1791   for (CfgNode *Node : Nodes) {
   1792     if (NeedSandboxing && Node->needsAlignment()) {
   1793       Str << "\t" << Asm->getAlignDirective() << " "
   1794           << Asm->getBundleAlignLog2Bytes() << "\n";
   1795     }
   1796     Node->emit(this);
   1797   }
   1798   emitJumpTables();
   1799   Str << "\n";
   1800 }
   1801 
   1802 void Cfg::emitIAS() {
   1803   TimerMarker T(TimerStack::TT_emitAsm, this);
   1804   // The emitIAS() routines emit into the internal assembler buffer, so there's
   1805   // no need to lock the streams.
   1806   const bool NeedSandboxing = getFlags().getUseSandboxing();
   1807   for (CfgNode *Node : Nodes) {
   1808     if (NeedSandboxing && Node->needsAlignment())
   1809       getAssembler()->alignCfgNode();
   1810     Node->emitIAS(this);
   1811   }
   1812   emitJumpTables();
   1813 }
   1814 
   1815 size_t Cfg::getTotalMemoryMB() const {
   1816   constexpr size_t _1MB = 1024 * 1024;
   1817   assert(Allocator != nullptr);
   1818   assert(CfgAllocatorTraits::current() == Allocator.get());
   1819   return Allocator->getTotalMemory() / _1MB;
   1820 }
   1821 
   1822 size_t Cfg::getLivenessMemoryMB() const {
   1823   constexpr size_t _1MB = 1024 * 1024;
   1824   if (Live == nullptr) {
   1825     return 0;
   1826   }
   1827   return Live->getAllocator()->getTotalMemory() / _1MB;
   1828 }
   1829 
   1830 // Dumps the IR with an optional introductory message.
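// For example (illustrative), with IceV_Instructions enabled the header line
// looks like
//
//   define internal i32 @foo(i32 %a, i32 %b) { # foo(i=123:b=9)
//
// followed by optional per-variable liveness summaries, each basic block, and
// a closing "}".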
   1831 void Cfg::dump(const char *Message) {
   1832   if (!BuildDefs::dump())
   1833     return;
   1834   if (!isVerbose())
   1835     return;
   1836   OstreamLocker L(Ctx);
   1837   Ostream &Str = Ctx->getStrDump();
   1838   if (Message[0])
   1839     Str << "================ " << Message << " ================\n";
   1840   if (isVerbose(IceV_Mem)) {
   1841     Str << "Memory size = " << getTotalMemoryMB() << " MB\n";
   1842   }
   1843   setCurrentNode(getEntryNode());
   1844   // Print function name+args
   1845   if (isVerbose(IceV_Instructions)) {
   1846     Str << "define ";
   1847     if (getInternal() && !getFlags().getDisableInternal())
   1848       Str << "internal ";
   1849     Str << ReturnType << " @" << getFunctionName() << "(";
   1850     for (SizeT i = 0; i < Args.size(); ++i) {
   1851       if (i > 0)
   1852         Str << ", ";
   1853       Str << Args[i]->getType() << " ";
   1854       Args[i]->dump(this);
   1855     }
   1856     // Append an extra copy of the function name here, in order to print its
   1857     // size stats but not mess up lit tests.
   1858     Str << ") { # " << getFunctionNameAndSize() << "\n";
   1859   }
   1860   resetCurrentNode();
   1861   if (isVerbose(IceV_Liveness)) {
   1862     // Print summary info about variables
   1863     for (Variable *Var : Variables) {
   1864       Str << "// multiblock=";
   1865       if (getVMetadata()->isTracked(Var))
   1866         Str << getVMetadata()->isMultiBlock(Var);
   1867       else
   1868         Str << "?";
   1869       Str << " defs=";
   1870       bool FirstPrint = true;
   1871       if (VMetadata->getKind() != VMK_Uses) {
   1872         if (const Inst *FirstDef = VMetadata->getFirstDefinition(Var)) {
   1873           Str << FirstDef->getNumber();
   1874           FirstPrint = false;
   1875         }
   1876       }
   1877       if (VMetadata->getKind() == VMK_All) {
   1878         for (const Inst *Instr : VMetadata->getLatterDefinitions(Var)) {
   1879           if (!FirstPrint)
   1880             Str << ",";
   1881           Str << Instr->getNumber();
   1882           FirstPrint = false;
   1883         }
   1884       }
   1885       Str << " weight=" << Var->getWeight(this) << " ";
   1886       Var->dump(this);
   1887       Str << " LIVE=" << Var->getLiveRange() << "\n";
   1888     }
   1889   }
   1890   // Print each basic block
   1891   for (CfgNode *Node : Nodes)
   1892     Node->dump(this);
   1893   if (isVerbose(IceV_Instructions))
   1894     Str << "}\n";
   1895 }
   1896 
   1897 } // end of namespace Ice
   1898