subzero/src/IceCfg.cpp

//===- subzero/src/IceCfg.cpp - Control flow graph implementation ---------===//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the Cfg class.
///
//===----------------------------------------------------------------------===//

#include "IceCfg.h"

#include "IceAssembler.h"
#include "IceBitVector.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInst.h"
#include "IceInstVarIter.h"
#include "IceInstrumentation.h"
#include "IceLiveness.h"
#include "IceLoopAnalyzer.h"
#include "IceOperand.h"
#include "IceTargetLowering.h"

#include <memory>
#include <utility>

namespace Ice {

Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)
    : Allocator(createAllocator()), Ctx(Ctx), SequenceNumber(SequenceNumber),
      VMask(getFlags().getVerbose()), FunctionName(),
      NextInstNumber(Inst::NumberInitial), Live(nullptr) {
  NodeStrings.reset(new StringPool);
  VarStrings.reset(new StringPool);
  CfgLocalAllocatorScope _(this);
  Target = TargetLowering::createLowering(getFlags().getTargetArch(), this);
  VMetadata.reset(new VariablesMetadata(this));
  TargetAssembler = Target->createAssembler();

  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Randomize) {
    // If -randomize-pool-immediates=randomize, create a random number
    // generator to generate a cookie for constant blinding.
    RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_ConstantBlinding,
                              this->SequenceNumber);
    ConstantBlindingCookie =
        (uint32_t)RNG.next((uint64_t)std::numeric_limits<uint32_t>::max() + 1);
  }
}

Cfg::~Cfg() {
  assert(CfgAllocatorTraits::current() == nullptr);
  if (getFlags().getDumpStrings()) {
    OstreamLocker _(Ctx);
    Ostream &Str = Ctx->getStrDump();
    getNodeStrings()->dump(Str);
    getVarStrings()->dump(Str);
  }
}

// Called in the initalizer list of Cfg's constructor to create the Allocator
// and set it as TLS before any other member fields are constructed, since they
// may depend on it.
ArenaAllocator *Cfg::createAllocator() {
  ArenaAllocator *Allocator = new ArenaAllocator();
  CfgAllocatorTraits::set_current(Allocator);
  return Allocator;
}

/// Create a string like "foo(i=123:b=9)" indicating the function name, number
/// of high-level instructions, and number of basic blocks.  This string is only
/// used for dumping and other diagnostics, and the idea is that given a set of
/// functions to debug a problem on, it's easy to find the smallest or simplest
/// function to attack.  Note that the counts may change somewhat depending on
/// what point it is called during the translation passes.
std::string Cfg::getFunctionNameAndSize() const {
  if (!BuildDefs::dump())
    return getFunctionName().toString();
  SizeT NodeCount = 0;
  SizeT InstCount = 0;
  for (CfgNode *Node : getNodes()) {
    ++NodeCount;
    // Note: deleted instructions are *not* ignored.
    InstCount += Node->getPhis().size();
    for (Inst &I : Node->getInsts()) {
      if (!llvm::isa<InstTarget>(&I))
        ++InstCount;
    }
  }
  return getFunctionName() + "(i=" + std::to_string(InstCount) + ":b=" +
         std::to_string(NodeCount) + ")";
}

void Cfg::setError(const std::string &Message) {
  HasError = true;
  ErrorMessage = Message;
}

CfgNode *Cfg::makeNode() {
  SizeT LabelIndex = Nodes.size();
  auto *Node = CfgNode::create(this, LabelIndex);
  Nodes.push_back(Node);
  return Node;
}

void Cfg::swapNodes(NodeList &NewNodes) {
  assert(Nodes.size() == NewNodes.size());
  Nodes.swap(NewNodes);
  for (SizeT I = 0, NumNodes = getNumNodes(); I < NumNodes; ++I)
    Nodes[I]->resetIndex(I);
}

template <> Variable *Cfg::makeVariable<Variable>(Type Ty) {
  SizeT Index = Variables.size();
  Variable *Var;
  if (Target->shouldSplitToVariableVecOn32(Ty)) {
    Var = VariableVecOn32::create(this, Ty, Index);
  } else if (Target->shouldSplitToVariable64On32(Ty)) {
    Var = Variable64On32::create(this, Ty, Index);
  } else {
    Var = Variable::create(this, Ty, Index);
  }
  Variables.push_back(Var);
  return Var;
}

void Cfg::addArg(Variable *Arg) {
  Arg->setIsArg();
  Args.push_back(Arg);
}

void Cfg::addImplicitArg(Variable *Arg) {
  Arg->setIsImplicitArg();
  ImplicitArgs.push_back(Arg);
}

// Returns whether the stack frame layout has been computed yet. This is used
// for dumping the stack frame location of Variables.
bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }

namespace {
constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$";
constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$";
} // end of anonymous namespace

void Cfg::createNodeNameDeclaration(const std::string &NodeAsmName) {
  auto *Var = VariableDeclaration::create(GlobalInits.get());
  Var->setName(Ctx, BlockNameGlobalPrefix + NodeAsmName);
  Var->setIsConstant(true);
  Var->addInitializer(VariableDeclaration::DataInitializer::create(
      GlobalInits.get(), NodeAsmName.data(), NodeAsmName.size() + 1));
  const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
  Var->setAlignment(Int64ByteSize); // Wasteful, 32-bit could use 4 bytes.
  GlobalInits->push_back(Var);
}

void Cfg::createBlockProfilingInfoDeclaration(
    const std::string &NodeAsmName, VariableDeclaration *NodeNameDeclaration) {
  auto *Var = VariableDeclaration::create(GlobalInits.get());
  Var->setName(Ctx, BlockStatsGlobalPrefix + NodeAsmName);
  const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
  Var->addInitializer(VariableDeclaration::ZeroInitializer::create(
      GlobalInits.get(), Int64ByteSize));

  const RelocOffsetT NodeNameDeclarationOffset = 0;
  Var->addInitializer(VariableDeclaration::RelocInitializer::create(
      GlobalInits.get(), NodeNameDeclaration,
      {RelocOffset::create(Ctx, NodeNameDeclarationOffset)}));
  Var->setAlignment(Int64ByteSize);
  GlobalInits->push_back(Var);
}

void Cfg::profileBlocks() {
  if (GlobalInits == nullptr)
    GlobalInits.reset(new VariableDeclarationList());

  for (CfgNode *Node : Nodes) {
    const std::string NodeAsmName = Node->getAsmName();
    createNodeNameDeclaration(NodeAsmName);
    createBlockProfilingInfoDeclaration(NodeAsmName, GlobalInits->back());
    Node->profileExecutionCount(GlobalInits->back());
  }
}

bool Cfg::isProfileGlobal(const VariableDeclaration &Var) {
  if (!Var.getName().hasStdString())
    return false;
  return Var.getName().toString().find(BlockStatsGlobalPrefix) == 0;
}

void Cfg::addCallToProfileSummary() {
  // The call(s) to __Sz_profile_summary are added by the profiler in functions
  // that cause the program to exit. This function is defined in
  // runtime/szrt_profiler.c.
  Constant *ProfileSummarySym =
      Ctx->getConstantExternSym(Ctx->getGlobalString("__Sz_profile_summary"));
  constexpr SizeT NumArgs = 0;
  constexpr Variable *Void = nullptr;
  constexpr bool HasTailCall = false;
  auto *Call =
      InstCall::create(this, NumArgs, Void, ProfileSummarySym, HasTailCall);
  getEntryNode()->getInsts().push_front(Call);
}

void Cfg::translate() {
  if (hasError())
    return;
  // Cache the possibly-overridden optimization level once translation begins.
  // It would be nicer to do this in the constructor, but we need to wait until
  // after setFunctionName() has a chance to be called.
  OptimizationLevel =
      getFlags().matchForceO2(getFunctionName(), getSequenceNumber())
          ? Opt_2
          : getFlags().getOptLevel();
  if (BuildDefs::timers()) {
    if (getFlags().matchTimingFocus(getFunctionName(), getSequenceNumber())) {
      setFocusedTiming();
      getContext()->resetTimer(GlobalContext::TSK_Default);
    }
  }
  if (BuildDefs::dump()) {
    if (isVerbose(IceV_Status) &&
        getFlags().matchTestStatus(getFunctionName(), getSequenceNumber())) {
      getContext()->getStrDump() << ">>>Translating "
                                 << getFunctionNameAndSize()
                                 << " seq=" << getSequenceNumber() << "\n";
    }
  }
  TimerMarker T_func(getContext(), getFunctionName().toStringOrEmpty());
  TimerMarker T(TimerStack::TT_translate, this);

  dump("Initial CFG");

  if (getFlags().getEnableBlockProfile()) {
    profileBlocks();
    // TODO(jpp): this is fragile, at best. Figure out a better way of
    // detecting exit functions.
    if (getFunctionName().toStringOrEmpty() == "exit") {
      addCallToProfileSummary();
    }
    dump("Profiled CFG");
  }

  // Create the Hi and Lo variables where a split was needed
  for (Variable *Var : Variables) {
    if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) {
      Var64On32->initHiLo(this);
    } else if (auto *VarVecOn32 = llvm::dyn_cast<VariableVecOn32>(Var)) {
      VarVecOn32->initVecElement(this);
    }
  }

  // Instrument the Cfg, e.g. with AddressSanitizer
  if (!BuildDefs::minimal() && getFlags().getSanitizeAddresses()) {
    getContext()->instrumentFunc(this);
    dump("Instrumented CFG");
  }

  // The set of translation passes and their order are determined by the
  // target.
  getTarget()->translate();

  dump("Final output");
  if (getFocusedTiming()) {
    getContext()->dumpLocalTimers(getFunctionName().toString());
  }
}

void Cfg::fixPhiNodes() {
  for (auto *Node : Nodes) {
    // Fix all the phi edges since WASM can't tell how to make them correctly at
    // the beginning.
    assert(Node);
    const auto &InEdges = Node->getInEdges();
    for (auto &Instr : Node->getPhis()) {
      auto *Phi = llvm::cast<InstPhi>(&Instr);
      assert(Phi);
      for (SizeT i = 0; i < InEdges.size(); ++i) {
        Phi->setLabel(i, InEdges[i]);
      }
    }
  }
}

void Cfg::computeInOutEdges() {
  // Compute the out-edges.
  for (CfgNode *Node : Nodes) {
    Node->computeSuccessors();
  }

  // Prune any unreachable nodes before computing in-edges.
  SizeT NumNodes = getNumNodes();
  BitVector Reachable(NumNodes);
  BitVector Pending(NumNodes);
  Pending.set(getEntryNode()->getIndex());
  while (true) {
    int Index = Pending.find_first();
    if (Index == -1)
      break;
    Pending.reset(Index);
    Reachable.set(Index);
    CfgNode *Node = Nodes[Index];
    assert(Node->getIndex() == (SizeT)Index);
    for (CfgNode *Succ : Node->getOutEdges()) {
      SizeT SuccIndex = Succ->getIndex();
      if (!Reachable.test(SuccIndex))
        Pending.set(SuccIndex);
    }
  }
  SizeT Dest = 0;
  for (SizeT Source = 0; Source < NumNodes; ++Source) {
    if (Reachable.test(Source)) {
      Nodes[Dest] = Nodes[Source];
      Nodes[Dest]->resetIndex(Dest);
      // Compute the in-edges.
      Nodes[Dest]->computePredecessors();
      ++Dest;
    }
  }
  Nodes.resize(Dest);

  TimerMarker T(TimerStack::TT_phiValidation, this);
  for (CfgNode *Node : Nodes)
    Node->enforcePhiConsistency();
}

void Cfg::renumberInstructions() {
  TimerMarker T(TimerStack::TT_renumberInstructions, this);
  NextInstNumber = Inst::NumberInitial;
  for (CfgNode *Node : Nodes)
    Node->renumberInstructions();
  // Make sure the entry node is the first node and therefore got the lowest
  // instruction numbers, to facilitate live range computation of function
  // arguments.  We want to model function arguments as being live on entry to
  // the function, otherwise an argument whose only use is in the first
  // instruction will be assigned a trivial live range and the register
  // allocator will not recognize its live range as overlapping another
  // variable's live range.
  assert(Nodes.empty() || (*Nodes.begin() == getEntryNode()));
}

// placePhiLoads() must be called before placePhiStores().
void Cfg::placePhiLoads() {
  TimerMarker T(TimerStack::TT_placePhiLoads, this);
  for (CfgNode *Node : Nodes)
    Node->placePhiLoads();
}

// placePhiStores() must be called after placePhiLoads().
void Cfg::placePhiStores() {
  TimerMarker T(TimerStack::TT_placePhiStores, this);
  for (CfgNode *Node : Nodes)
    Node->placePhiStores();
}

void Cfg::deletePhis() {
  TimerMarker T(TimerStack::TT_deletePhis, this);
  for (CfgNode *Node : Nodes)
    Node->deletePhis();
}

void Cfg::advancedPhiLowering() {
  TimerMarker T(TimerStack::TT_advancedPhiLowering, this);
  // Clear all previously computed live ranges (but not live-in/live-out bit
  // vectors or last-use markers), because the follow-on register allocation is
  // only concerned with live ranges across the newly created blocks.
  for (Variable *Var : Variables) {
    Var->getLiveRange().reset();
  }
  // This splits edges and appends new nodes to the end of the node list. This
  // can invalidate iterators, so don't use an iterator.
  SizeT NumNodes = getNumNodes();
  SizeT NumVars = getNumVariables();
  for (SizeT I = 0; I < NumNodes; ++I)
    Nodes[I]->advancedPhiLowering();

  TimerMarker TT(TimerStack::TT_lowerPhiAssignments, this);
  if (true) {
    // The following code does an in-place update of liveness and live ranges
    // as a result of adding the new phi edge split nodes.
    getLiveness()->initPhiEdgeSplits(Nodes.begin() + NumNodes,
                                     Variables.begin() + NumVars);
    TimerMarker TTT(TimerStack::TT_liveness, this);
    // Iterate over the newly added nodes to add their liveness info.
    for (auto I = Nodes.begin() + NumNodes, E = Nodes.end(); I != E; ++I) {
      InstNumberT FirstInstNum = getNextInstNumber();
      (*I)->renumberInstructions();
      InstNumberT LastInstNum = getNextInstNumber() - 1;
      (*I)->liveness(getLiveness());
      (*I)->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
    }
  } else {
    // The following code does a brute-force recalculation of live ranges as a
    // result of adding the new phi edge split nodes. The liveness calculation
    // is particularly expensive because the new nodes are not yet in a proper
    // topological order and so convergence is slower.
    //
    // This code is kept here for reference and can be temporarily enabled in
    // case the incremental code is under suspicion.
    renumberInstructions();
    liveness(Liveness_Intervals);
    getVMetadata()->init(VMK_All);
  }
  Target->regAlloc(RAK_Phi);
}

// Find a reasonable placement for nodes that have not yet been placed, while
// maintaining the same relative ordering among already placed nodes.
void Cfg::reorderNodes() {
  // TODO(ascull): it would be nice if the switch tests were always followed by
  // the default case to allow for fall through.
  using PlacedList = CfgList<CfgNode *>;
  PlacedList Placed;      // Nodes with relative placement locked down
  PlacedList Unreachable; // Unreachable nodes
  PlacedList::iterator NoPlace = Placed.end();
  // Keep track of where each node has been tentatively placed so that we can
  // manage insertions into the middle.
  CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
  for (CfgNode *Node : Nodes) {
    // The "do ... while(0);" construct is to factor out the --PlaceIndex and
    // assert() statements before moving to the next node.
    do {
      if (Node != getEntryNode() && Node->getInEdges().empty()) {
        // The node has essentially been deleted since it is not a successor of
        // any other node.
        Unreachable.push_back(Node);
        PlaceIndex[Node->getIndex()] = Unreachable.end();
        Node->setNeedsPlacement(false);
        continue;
      }
      if (!Node->needsPlacement()) {
        // Add to the end of the Placed list.
        Placed.push_back(Node);
        PlaceIndex[Node->getIndex()] = Placed.end();
        continue;
      }
      Node->setNeedsPlacement(false);
      // Assume for now that the unplaced node is from edge-splitting and
      // therefore has 1 in-edge and 1 out-edge (actually, possibly more than 1
      // in-edge if the predecessor node was contracted). If this changes in
      // the future, rethink the strategy.
      assert(Node->getInEdges().size() >= 1);
      assert(Node->hasSingleOutEdge());

      // If it's a (non-critical) edge where the successor has a single
      // in-edge, then place it before the successor.
      CfgNode *Succ = Node->getOutEdges().front();
      if (Succ->getInEdges().size() == 1 &&
          PlaceIndex[Succ->getIndex()] != NoPlace) {
        Placed.insert(PlaceIndex[Succ->getIndex()], Node);
        PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()];
        continue;
      }

      // Otherwise, place it after the (first) predecessor.
      CfgNode *Pred = Node->getInEdges().front();
      auto PredPosition = PlaceIndex[Pred->getIndex()];
      // It shouldn't be the case that PredPosition==NoPlace, but if that
      // somehow turns out to be true, we just insert Node before
      // PredPosition=NoPlace=Placed.end() .
      if (PredPosition != NoPlace)
        ++PredPosition;
      Placed.insert(PredPosition, Node);
      PlaceIndex[Node->getIndex()] = PredPosition;
    } while (0);

    --PlaceIndex[Node->getIndex()];
    assert(*PlaceIndex[Node->getIndex()] == Node);
  }

  // Reorder Nodes according to the built-up lists.
  NodeList Reordered;
  Reordered.reserve(Placed.size() + Unreachable.size());
  for (CfgNode *Node : Placed)
    Reordered.push_back(Node);
  for (CfgNode *Node : Unreachable)
    Reordered.push_back(Node);
  assert(getNumNodes() == Reordered.size());
  swapNodes(Reordered);
}

namespace {
void getRandomPostOrder(CfgNode *Node, BitVector &ToVisit,
                        Ice::NodeList &PostOrder,
                        Ice::RandomNumberGenerator *RNG) {
  assert(ToVisit[Node->getIndex()]);
  ToVisit[Node->getIndex()] = false;
  NodeList Outs = Node->getOutEdges();
  Ice::RandomShuffle(Outs.begin(), Outs.end(),
                     [RNG](int N) { return RNG->next(N); });
  for (CfgNode *Next : Outs) {
    if (ToVisit[Next->getIndex()])
      getRandomPostOrder(Next, ToVisit, PostOrder, RNG);
  }
  PostOrder.push_back(Node);
}
} // end of anonymous namespace

void Cfg::shuffleNodes() {
  if (!getFlags().getReorderBasicBlocks())
    return;

  NodeList ReversedReachable;
  NodeList Unreachable;
  BitVector ToVisit(Nodes.size(), true);
  // Create Random number generator for function reordering
  RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                            RPE_BasicBlockReordering, SequenceNumber);
  // Traverse from entry node.
  getRandomPostOrder(getEntryNode(), ToVisit, ReversedReachable, &RNG);
  // Collect the unreachable nodes.
  for (CfgNode *Node : Nodes)
    if (ToVisit[Node->getIndex()])
      Unreachable.push_back(Node);
  // Copy the layout list to the Nodes.
  NodeList Shuffled;
  Shuffled.reserve(ReversedReachable.size() + Unreachable.size());
  for (CfgNode *Node : reverse_range(ReversedReachable))
    Shuffled.push_back(Node);
  for (CfgNode *Node : Unreachable)
    Shuffled.push_back(Node);
  assert(Nodes.size() == Shuffled.size());
  swapNodes(Shuffled);

  dump("After basic block shuffling");
}

void Cfg::localCSE(bool AssumeSSA) {
  // Performs basic-block local common-subexpression elimination
  // If we have
  // t1 = op b c
  // t2 = op b c
  // This pass will replace future references to t2 in a basic block by t1
  // Points to note:
  // 1. Assumes SSA by default. To change this, use -lcse=no-ssa
  //      This is needed if this pass is moved to a point later in the pipeline.
  //      If variables have a single definition (in the node), CSE can work just
  //      on the basis of an equality compare on instructions (sans Dest). When
  //      variables can be updated (hence, non-SSA) the result of a previous
  //      instruction which used that variable as an operand can not be reused.
  // 2. Leaves removal of instructions to DCE.
  // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
  //    to take care of cases not arising from GEP simplification.
  // 4. By default, a single pass is made over each basic block. Control this
  //    with -lcse-max-iters=N

  TimerMarker T(TimerStack::TT_localCse, this);
  struct VariableHash {
    size_t operator()(const Variable *Var) const { return Var->hashValue(); }
  };

  struct InstHash {
    size_t operator()(const Inst *Instr) const {
      auto Kind = Instr->getKind();
      auto Result =
          std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
              Kind);
      for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
        Result ^= Instr->getSrc(i)->hashValue();
      }
      return Result;
    }
  };
  struct InstEq {
    bool srcEq(const Operand *A, const Operand *B) const {
      if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
        return (A == B);
      return false;
    }
    bool operator()(const Inst *InstrA, const Inst *InstrB) const {
      if ((InstrA->getKind() != InstrB->getKind()) ||
          (InstrA->getSrcSize() != InstrB->getSrcSize()))
        return false;

      if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
        auto *B = llvm::cast<InstArithmetic>(InstrB);
        // A, B are guaranteed to be of the same 'kind' at this point
        // So, dyn_cast is not needed
        if (A->getOp() != B->getOp())
          return false;
      }
      // Does not enter loop if different kind or number of operands
      for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
        if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
          return false;
      }
      return true;
    }
  };

  for (CfgNode *Node : getNodes()) {
    CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
    // Stores currently available instructions.

    CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
    // Combining the above two into a single data structure might consume less
    // memory but will be slower i.e map of Instruction -> Set of Variables

    CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
    // Maps a variable to the Instructions that depend on it.
    // a = op1 b c
    // x = op2 c d
    // Will result in the map : b -> {a}, c -> {a, x}, d -> {x}
    // Not necessary for SSA as dependencies will never be invalidated, and the
    // container will use minimal memory when left unused.

    auto IterCount = getFlags().getLocalCseMaxIterations();

    for (uint32_t i = 0; i < IterCount; ++i) {
      // TODO(manasijm): Stats on IterCount -> performance
      for (Inst &Instr : Node->getInsts()) {
        if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
          continue;
        if (!AssumeSSA) {
          // Invalidate replacements
          auto Iter = Replacements.find(Instr.getDest());
          if (Iter != Replacements.end()) {
            Replacements.erase(Iter);
          }

          // Invalidate 'seen' instructions whose operands were just updated.
          auto DepIter = Dependency.find(Instr.getDest());
          if (DepIter != Dependency.end()) {
            for (auto *DepInst : DepIter->second) {
              Seen.erase(DepInst);
            }
          }
        }

        // Replace - doing this before checking for repetitions might enable
        // more optimizations
        for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
          auto *Opnd = Instr.getSrc(i);
          if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
            if (Replacements.find(Var) != Replacements.end()) {
              Instr.replaceSource(i, Replacements[Var]);
            }
          }
        }

        // Check for repetitions
        auto SeenIter = Seen.find(&Instr);
        if (SeenIter != Seen.end()) { // seen before
          const Inst *Found = *SeenIter;
          Replacements[Instr.getDest()] = Found->getDest();
        } else { // new
          Seen.insert(&Instr);

          if (!AssumeSSA) {
            // Update dependencies
            for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
              auto *Opnd = Instr.getSrc(i);
              if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
                Dependency[Var].push_back(&Instr);
              }
            }
          }
        }
      }
    }
  }
}

void Cfg::loopInvariantCodeMotion() {
  TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
  // Does not introduce new nodes as of now.
  for (auto &Loop : LoopInfo) {
    CfgNode *Header = Loop.Header;
    assert(Header);
    if (Header->getLoopNestDepth() < 1)
      return;
    CfgNode *PreHeader = Loop.PreHeader;
    if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) {
      return; // try next loop
    }

    auto &Insts = PreHeader->getInsts();
    auto &LastInst = Insts.back();
    Insts.pop_back();

    for (auto *Inst : findLoopInvariantInstructions(Loop.Body)) {
      PreHeader->appendInst(Inst);
    }
    PreHeader->appendInst(&LastInst);
  }
}

CfgVector<Inst *>
Cfg::findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> &Body) {
  CfgUnorderedSet<Inst *> InvariantInsts;
  CfgUnorderedSet<Variable *> InvariantVars;
  for (auto *Var : getArgs()) {
    InvariantVars.insert(Var);
  }
  bool Changed = false;
  do {
    Changed = false;
    for (auto NodeIndex : Body) {
      auto *Node = Nodes[NodeIndex];
      CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(),
                                                    Node->getInsts().end());

      for (auto &InstRef : Insts) {
        auto &Inst = InstRef.get();
        if (Inst.isDeleted() ||
            InvariantInsts.find(&Inst) != InvariantInsts.end())
          continue;
        switch (Inst.getKind()) {
        case Inst::InstKind::Alloca:
        case Inst::InstKind::Br:
        case Inst::InstKind::Ret:
        case Inst::InstKind::Phi:
        case Inst::InstKind::Call:
        case Inst::InstKind::IntrinsicCall:
        case Inst::InstKind::Load:
        case Inst::InstKind::Store:
        case Inst::InstKind::Switch:
          continue;
        default:
          break;
        }

        bool IsInvariant = true;
        for (SizeT i = 0; i < Inst.getSrcSize(); ++i) {
          if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) {
            if (InvariantVars.find(Var) == InvariantVars.end()) {
              IsInvariant = false;
            }
          }
        }
        if (IsInvariant) {
          Changed = true;
          InvariantInsts.insert(&Inst);
          Node->getInsts().remove(Inst);
          if (Inst.getDest() != nullptr) {
            InvariantVars.insert(Inst.getDest());
          }
        }
      }
    }
  } while (Changed);

  CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end());
  std::sort(InstVector.begin(), InstVector.end(),
            [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); });
  return InstVector;
}

void Cfg::shortCircuitJumps() {
  // Split Nodes whenever an early jump is possible.
  // __N :
  //   a = <something>
  //   Instruction 1 without side effect
  //   ... b = <something> ...
  //   Instruction N without side effect
  //   t1 = or a b
  //   br t1 __X __Y
  //
  // is transformed into:
  // __N :
  //   a = <something>
  //   br a __X __N_ext
  //
  // __N_ext :
  //   Instruction 1 without side effect
  //   ... b = <something> ...
  //   Instruction N without side effect
  //   br b __X __Y
  // (Similar logic for AND, jump to false instead of true target.)

  TimerMarker T(TimerStack::TT_shortCircuit, this);
  getVMetadata()->init(VMK_Uses);
  auto NodeStack = this->getNodes();
  CfgUnorderedMap<SizeT, CfgVector<CfgNode *>> Splits;
  while (!NodeStack.empty()) {
    auto *Node = NodeStack.back();
    NodeStack.pop_back();
    auto NewNode = Node->shortCircuit();
    if (NewNode) {
      NodeStack.push_back(NewNode);
      NodeStack.push_back(Node);
      Splits[Node->getIndex()].push_back(NewNode);
    }
  }

  // Insert nodes in the right place
  NodeList NewList;
  NewList.reserve(Nodes.size());
  CfgUnorderedSet<SizeT> Inserted;
  for (auto *Node : Nodes) {
    if (Inserted.find(Node->getIndex()) != Inserted.end())
      continue; // already inserted
    NodeList Stack{Node};
    while (!Stack.empty()) {
      auto *Current = Stack.back();
      Stack.pop_back();
      Inserted.insert(Current->getIndex());
      NewList.push_back(Current);
      for (auto *Next : Splits[Current->getIndex()]) {
        Stack.push_back(Next);
      }
    }
  }

  SizeT NodeIndex = 0;
  for (auto *Node : NewList) {
    Node->resetIndex(NodeIndex++);
  }
  Nodes = NewList;
}

void Cfg::floatConstantCSE() {
  // Load multiple uses of a floating point constant (between two call
  // instructions or block start/end) into a variable before its first use.
  //   t1 = b + 1.0
  //   t2 = c + 1.0
  // Gets transformed to:
  //   t0 = 1.0
  //   t0_1 = t0
  //   t1 = b + t0_1
  //   t2 = c + t0_1
  // Call instructions reset the procedure, but use the same variable, just in
  // case it got a register. We are assuming floating point registers are not
  // callee saved in general. Example, continuing from before:
  //   result = call <some function>
  //   t3 = d + 1.0
  // Gets transformed to:
  //   result = call <some function>
  //   t0_2 = t0
  //   t3 = d + t0_2
  // TODO(manasijm, stichnot): Figure out how to 'link' t0 to the stack slot of
  // 1.0. When t0 does not get a register, introducing an extra assignment
  // statement does not make sense. The relevant portion is marked below.

  TimerMarker _(TimerStack::TT_floatConstantCse, this);
  for (CfgNode *Node : getNodes()) {

    CfgUnorderedMap<Constant *, Variable *> ConstCache;
    auto Current = Node->getInsts().begin();
    auto End = Node->getInsts().end();
    while (Current != End) {
      CfgUnorderedMap<Constant *, CfgVector<InstList::iterator>> FloatUses;
      if (llvm::isa<InstCall>(iteratorToInst(Current))) {
        ++Current;
        assert(Current != End);
        // Block should not end with a call
      }
      while (Current != End && !llvm::isa<InstCall>(iteratorToInst(Current))) {
        if (!Current->isDeleted()) {
          for (SizeT i = 0; i < Current->getSrcSize(); ++i) {
            if (auto *Const = llvm::dyn_cast<Constant>(Current->getSrc(i))) {
              if (Const->getType() == IceType_f32 ||
                  Const->getType() == IceType_f64) {
                FloatUses[Const].push_back(Current);
              }
            }
          }
        }
        ++Current;
      }
      for (auto &Pair : FloatUses) {
        static constexpr SizeT MinUseThreshold = 3;
        if (Pair.second.size() < MinUseThreshold)
          continue;
        // Only consider constants with at least `MinUseThreshold` uses
        auto &Insts = Node->getInsts();

        if (ConstCache.find(Pair.first) == ConstCache.end()) {
          // Saw a constant (which is used at least twice) for the first time
          auto *NewVar = makeVariable(Pair.first->getType());
          // NewVar->setLinkedTo(Pair.first);
          // TODO(manasijm): Plumbing for linking to an Operand.
          auto *Assign = InstAssign::create(Node->getCfg(), NewVar, Pair.first);
          Insts.insert(Pair.second[0], Assign);
          ConstCache[Pair.first] = NewVar;
        }

        auto *NewVar = makeVariable(Pair.first->getType());
        NewVar->setLinkedTo(ConstCache[Pair.first]);
        auto *Assign =
            InstAssign::create(Node->getCfg(), NewVar, ConstCache[Pair.first]);

        Insts.insert(Pair.second[0], Assign);
        for (auto InstUse : Pair.second) {
          for (SizeT i = 0; i < InstUse->getSrcSize(); ++i) {
            if (auto *Const = llvm::dyn_cast<Constant>(InstUse->getSrc(i))) {
              if (Const == Pair.first) {
                InstUse->replaceSource(i, NewVar);
              }
            }
          }
        }
      }
    }
  }
}

void Cfg::doArgLowering() {
  TimerMarker T(TimerStack::TT_doArgLowering, this);
  getTarget()->lowerArguments();
}

void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
                                uint32_t CombinedAlignment, InstList &Insts,
                                AllocaBaseVariableType BaseVariableType) {
  if (Allocas.empty())
    return;
  // Sort by decreasing alignment.
  std::sort(Allocas.begin(), Allocas.end(), [](InstAlloca *A1, InstAlloca *A2) {
    uint32_t Align1 = A1->getAlignInBytes();
    uint32_t Align2 = A2->getAlignInBytes();
    if (Align1 == Align2)
      return A1->getNumber() < A2->getNumber();
    else
      return Align1 > Align2;
  });
  // Process the allocas in order of decreasing stack alignment.  This allows
  // us to pack less-aligned pieces after more-aligned ones, resulting in less
  // stack growth.  It also allows there to be at most one stack alignment "and"
  // instruction for a whole list of allocas.
  uint32_t CurrentOffset = 0;
  CfgVector<int32_t> Offsets;
  for (Inst *Instr : Allocas) {
    auto *Alloca = llvm::cast<InstAlloca>(Instr);
    // Adjust the size of the allocation up to the next multiple of the
    // object's alignment.
    uint32_t Alignment = std::max(Alloca->getAlignInBytes(), 1u);
    auto *ConstSize =
        llvm::dyn_cast<ConstantInteger32>(Alloca->getSizeInBytes());
    uint32_t Size = Utils::applyAlignment(ConstSize->getValue(), Alignment);
    if (BaseVariableType == BVT_FramePointer) {
      // Addressing is relative to the frame pointer.  Subtract the offset after
      // adding the size of the alloca, because it grows downwards from the
      // frame pointer.
      Offsets.push_back(Target->getFramePointerOffset(CurrentOffset, Size));
    } else {
      // Addressing is relative to the stack pointer or to a user pointer.  Add
      // the offset before adding the size of the object, because it grows
      // upwards from the stack pointer. In addition, if the addressing is
      // relative to the stack pointer, we need to add the pre-computed max out
      // args size bytes.
      const uint32_t OutArgsOffsetOrZero =
          (BaseVariableType == BVT_StackPointer)
              ? getTarget()->maxOutArgsSizeBytes()
              : 0;
      Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
    }
    // Update the running offset of the fused alloca region.
    CurrentOffset += Size;
  }
  // Round the offset up to the alignment granularity to use as the size.
  uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment);
  // Ensure every alloca was assigned an offset.
  assert(Allocas.size() == Offsets.size());

  switch (BaseVariableType) {
  case BVT_UserPointer: {
    Variable *BaseVariable = makeVariable(IceType_i32);
    for (SizeT i = 0; i < Allocas.size(); ++i) {
      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
      // Emit a new addition operation to replace the alloca.
      Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]);
      InstArithmetic *Add =
          InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(),
                                 BaseVariable, AllocaOffset);
      Insts.push_front(Add);
      Alloca->setDeleted();
    }
    Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
    InstAlloca *CombinedAlloca =
        InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment);
    CombinedAlloca->setKnownFrameOffset();
    Insts.push_front(CombinedAlloca);
  } break;
  case BVT_StackPointer:
  case BVT_FramePointer: {
    for (SizeT i = 0; i < Allocas.size(); ++i) {
      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
      // Emit a fake definition of the rematerializable variable.
      Variable *Dest = Alloca->getDest();
      auto *Def = InstFakeDef::create(this, Dest);
      if (BaseVariableType == BVT_StackPointer)
        Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
      else
        Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]);
      Insts.push_front(Def);
      Alloca->setDeleted();
    }
    // Allocate the fixed area in the function prolog.
    getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment);
  } break;
  }
}

void Cfg::processAllocas(bool SortAndCombine) {
  TimerMarker _(TimerStack::TT_alloca, this);
  const uint32_t StackAlignment = getTarget()->getStackAlignment();
  CfgNode *EntryNode = getEntryNode();
  assert(EntryNode);
  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(StackAlignment));
  // If the ABI's stack alignment is smaller than the vector size (16 bytes),
  // conservatively use a frame pointer to allow for explicit alignment of the
  // stack pointer. This needs to happen before register allocation so the frame
  // pointer can be reserved.
  if (getTarget()->needsStackPointerAlignment()) {
    getTarget()->setHasFramePointer();
  }
  // Determine if there are large alignment allocations in the entry block or
  // dynamic allocations (variable size in the entry block).
  bool HasLargeAlignment = false;
  bool HasDynamicAllocation = false;
  for (Inst &Instr : EntryNode->getInsts()) {
    if (Instr.isDeleted())
      continue;
    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
      uint32_t AlignmentParam = Alloca->getAlignInBytes();
      if (AlignmentParam > StackAlignment)
        HasLargeAlignment = true;
      if (llvm::isa<Constant>(Alloca->getSizeInBytes()))
        Alloca->setKnownFrameOffset();
      else {
        HasDynamicAllocation = true;
        // If Allocas are not sorted, the first dynamic allocation causes
        // later Allocas to be at unknown offsets relative to the stack/frame.
        if (!SortAndCombine)
          break;
      }
    }
  }
  // Don't do the heavyweight sorting and layout for low optimization levels.
  if (!SortAndCombine)
    return;
  // Any alloca outside the entry block is a dynamic allocation.
  for (CfgNode *Node : Nodes) {
    if (Node == EntryNode)
      continue;
    for (Inst &Instr : Node->getInsts()) {
      if (Instr.isDeleted())
        continue;
      if (llvm::isa<InstAlloca>(&Instr)) {
        // Allocations outside the entry block require a frame pointer.
        HasDynamicAllocation = true;
        break;
      }
    }
    if (HasDynamicAllocation)
      break;
  }
  // Mark the target as requiring a frame pointer.
  if (HasLargeAlignment || HasDynamicAllocation)
    getTarget()->setHasFramePointer();
  // Collect the Allocas into the two vectors.
  // Allocas in the entry block that have constant size and alignment less
  // than or equal to the function's stack alignment.
  CfgVector<InstAlloca *> FixedAllocas;
  // Allocas in the entry block that have constant size and alignment greater
  // than the function's stack alignment.
  CfgVector<InstAlloca *> AlignedAllocas;
  // Maximum alignment used by any alloca.
  uint32_t MaxAlignment = StackAlignment;
  for (Inst &Instr : EntryNode->getInsts()) {
    if (Instr.isDeleted())
      continue;
    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
      if (!llvm::isa<Constant>(Alloca->getSizeInBytes()))
        continue;
      uint32_t AlignmentParam = Alloca->getAlignInBytes();
      // For default align=0, set it to the real value 1, to avoid any
      // bit-manipulation problems below.
      AlignmentParam = std::max(AlignmentParam, 1u);
      assert(llvm::isPowerOf2_32(AlignmentParam));
      if (HasDynamicAllocation && AlignmentParam > StackAlignment) {
        // If we have both dynamic allocations and large stack alignments,
        // high-alignment allocations are pulled out with their own base.
        AlignedAllocas.push_back(Alloca);
      } else {
        FixedAllocas.push_back(Alloca);
      }
      MaxAlignment = std::max(AlignmentParam, MaxAlignment);
    }
  }
  // Add instructions to the head of the entry block in reverse order.
  InstList &Insts = getEntryNode()->getInsts();
  if (HasDynamicAllocation && HasLargeAlignment) {
    // We are using a frame pointer, but fixed large-alignment alloca addresses
    // do not have a known offset from either the stack or frame pointer.
    // They grow up from a user pointer from an alloca.
    sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer);
    // Fixed size allocas are addressed relative to the frame pointer.
    sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts,
                          BVT_FramePointer);
  } else {
    // Otherwise, fixed size allocas are addressed relative to the stack unless
    // there are dynamic allocas.
    const AllocaBaseVariableType BasePointerType =
        (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer);
    sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
  }
  if (!FixedAllocas.empty() || !AlignedAllocas.empty())
    // No use calling findRematerializable() unless there is some
    // rematerializable alloca instruction to seed it.
    findRematerializable();
}

namespace {

// Helpers for findRematerializable().  For each of them, if a suitable
// rematerialization is found, the instruction's Dest variable is set to be
// rematerializable and it returns true, otherwise it returns false.

bool rematerializeArithmetic(const Inst *Instr) {
  // Check that it's an Arithmetic instruction with an Add operation.
  auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr);
  if (Arith == nullptr || Arith->getOp() != InstArithmetic::Add)
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Arith->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  // Check that Src(1) is an immediate.
  auto *Src1Imm = llvm::dyn_cast<ConstantInteger32>(Arith->getSrc(1));
  if (Src1Imm == nullptr)
    return false;
  Arith->getDest()->setRematerializable(
      Src0Var->getRegNum(), Src0Var->getStackOffset() + Src1Imm->getValue());
  return true;
}

bool rematerializeAssign(const Inst *Instr) {
  // An InstAssign only originates from an inttoptr or ptrtoint instruction,
  // which never occurs in a MINIMAL build.
  if (BuildDefs::minimal())
    return false;
  // Check that it's an Assign instruction.
  if (!llvm::isa<InstAssign>(Instr))
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Instr->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  Instr->getDest()->setRematerializable(Src0Var->getRegNum(),
                                        Src0Var->getStackOffset());
  return true;
}

bool rematerializeCast(const Inst *Instr) {
  // An pointer-type bitcast never occurs in a MINIMAL build.
  if (BuildDefs::minimal())
    return false;
  // Check that it's a Cast instruction with a Bitcast operation.
  auto *Cast = llvm::dyn_cast<InstCast>(Instr);
  if (Cast == nullptr || Cast->getCastKind() != InstCast::Bitcast)
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Cast->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  // Check that Dest and Src(0) have the same type.
  Variable *Dest = Cast->getDest();
  if (Dest->getType() != Src0Var->getType())
    return false;
  Dest->setRematerializable(Src0Var->getRegNum(), Src0Var->getStackOffset());
  return true;
}

} // end of anonymous namespace

/// Scan the function to find additional rematerializable variables.  This is
/// possible when the source operand of an InstAssignment is a rematerializable
/// variable, or the same for a pointer-type InstCast::Bitcast, or when an
/// InstArithmetic is an add of a rematerializable variable and an immediate.
/// Note that InstAssignment instructions and pointer-type InstCast::Bitcast
/// instructions generally only come about from the IceConverter's treatment of
/// inttoptr, ptrtoint, and bitcast instructions.  TODO(stichnot): Consider
/// other possibilities, however unlikely, such as InstArithmetic::Sub, or
/// commutativity.
void Cfg::findRematerializable() {
  // Scan the instructions in order, and repeat until no new opportunities are
  // found.  It may take more than one iteration because a variable's defining
  // block may happen to come after a block where it is used, depending on the
  // CfgNode linearization order.
  bool FoundNewAssignment;
  do {
    FoundNewAssignment = false;
    for (CfgNode *Node : getNodes()) {
      // No need to process Phi instructions.
      for (Inst &Instr : Node->getInsts()) {
        if (Instr.isDeleted())
          continue;
        Variable *Dest = Instr.getDest();
        if (Dest == nullptr || Dest->isRematerializable())
          continue;
        if (rematerializeArithmetic(&Instr) || rematerializeAssign(&Instr) ||
            rematerializeCast(&Instr)) {
          FoundNewAssignment = true;
        }
      }
    }
  } while (FoundNewAssignment);
}

void Cfg::doAddressOpt() {
  TimerMarker T(TimerStack::TT_doAddressOpt, this);
  for (CfgNode *Node : Nodes)
    Node->doAddressOpt();
}

namespace {
// ShuffleVectorUtils implements helper functions for rematerializing
// shufflevector instructions from a sequence of extractelement/insertelement
// instructions. It looks for the following pattern:
//
// %t0 = extractelement A, %n0
// %t1 = extractelement B, %n1
// %t2 = extractelement C, %n2
// ...
// %tN = extractelement N, %nN
// %d0 = insertelement undef, %t0, 0
// %d1 = insertelement %d0, %t1, 1
// %d2 = insertelement %d1, %t2, 2
// ...
// %dest = insertelement %d_N-1, %tN, N
//
// where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two
// distinct variables.
namespace ShuffleVectorUtils {
// findAllInserts is used when searching for all the insertelements that are
// used in a shufflevector operation. This function works recursively, when
// invoked with I = i, the function assumes Insts[i] is the last found
// insertelement in the chain. The next insertelement insertruction is saved in
// Insts[i+1].
bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                    CfgVector<const Inst *> *Insts, SizeT I = 0) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  if (I > Insts->size()) {
    if (Verbose) {
      Ctx->getStrDump() << "\tToo many inserts.\n";
    }
    return false;
  }

  const auto *LastInsert = Insts->at(I);
  assert(llvm::isa<InstInsertElement>(LastInsert));

  if (I == Insts->size() - 1) {
    // Matching against undef is not really needed because the value in Src(0)
    // will be totally overwritten. We still enforce it anyways because the
    // PNaCl toolchain generates the bitcode with it.
    if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
      if (Verbose) {
        Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
                          << Insts->size();
        LastInsert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    // The following loop ensures that the insertelements are sorted. In theory,
    // we could relax this restriction and allow any order. As long as each
    // index appears exactly once, this chain is still a candidate for becoming
    // a shufflevector. The Insts vector is traversed backwards because the
    // instructions are "enqueued" in reverse order.
    int32_t ExpectedElement = 0;
    for (const auto *I : reverse_range(*Insts)) {
      if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() !=
          ExpectedElement) {
        return false;
      }
      ++ExpectedElement;
    }
    return true;
  }

  const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0));
  const auto *Def = VM->getSingleDefinition(Src0V);

  // Only optimize if the first operand in
  //
  //   Dest = insertelement A, B, 10
  //
  // is singly-def'ed.
  if (Def == nullptr) {
    if (Verbose) {
      Ctx->getStrDump() << "\tmulti-def: ";
      (*Insts)[I]->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // We also require the (single) definition to come from an insertelement
  // instruction.
  if (!llvm::isa<InstInsertElement>(Def)) {
    if (Verbose) {
      Ctx->getStrDump() << "\tnot insert element: ";
      Def->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // Everything seems fine, so we save Def in Insts, and delegate the decision
  // to findAllInserts.
  (*Insts)[I + 1] = Def;

  return findAllInserts(Func, Ctx, VM, Insts, I + 1);
}

// insertsLastElement returns true if Insert is inserting an element in the last
// position of a vector.
bool insertsLastElement(const Inst &Insert) {
  const Type DestTy = Insert.getDest()->getType();
  assert(isVectorType(DestTy));
  const SizeT Elem =
      llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
  return Elem == typeNumElements(DestTy) - 1;
}

// findAllExtracts goes over all the insertelement instructions that are
// candidates to be replaced by a shufflevector, and searches for all the
// definitions of the elements being inserted. If all of the elements are the
// result of an extractelement instruction, and all of the extractelements
// operate on at most two different sources, than the instructions can be
// replaced by a shufflevector.
bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                     const CfgVector<const Inst *> &Insts, Variable **Src0,
                     Variable **Src1, CfgVector<const Inst *> *Extracts) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  *Src0 = nullptr;
  *Src1 = nullptr;
  assert(Insts.size() > 0);
  for (SizeT I = 0; I < Insts.size(); ++I) {
    const auto *Insert = Insts.at(I);
    const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
    if (Src1V == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "src(1) is not a variable: ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    const auto *Def = VM->getSingleDefinition(Src1V);
    if (Def == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "multi-def src(1): ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    if (!llvm::isa<InstExtractElement>(Def)) {
      if (Verbose) {
        Ctx->getStrDump() << "not extractelement: ";
        Def->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    auto *Src = llvm::cast<Variable>(Def->getSrc(0));
    if (*Src0 == nullptr) {
      // No sources yet. Save Src to Src0.
      *Src0 = Src;
    } else if (*Src1 == nullptr) {
      // We already have a source, so we might save Src in Src1 -- but only if
      // Src0 is not Src.
      if (*Src0 != Src) {
        *Src1 = Src;
      }
    } else if (Src != *Src0 && Src != *Src1) {
      // More than two sources, so we can't rematerialize the shufflevector
      // instruction.
      if (Verbose) {
        Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
      }
      return false;
    }

    (*Extracts)[I] = Def;
  }

  // We should have seen at least one source operand.
  assert(*Src0 != nullptr);

  // If a second source was not seen, then we just make Src1 = Src0 to simplify
  // things down stream. This should not matter, as all of the indexes in the
  // shufflevector instruction will point to Src0.
  if (*Src1 == nullptr) {
    *Src1 = *Src0;
  }

  return true;
}

} // end of namespace ShuffleVectorUtils
} // end of anonymous namespace

void Cfg::materializeVectorShuffles() {
  const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);

  std::unique_ptr<OstreamLocker> L;
  if (Verbose) {
    L.reset(new OstreamLocker(getContext()));
    getContext()->getStrDump() << "\nShuffle materialization:\n";
  }

  // MaxVectorElements is the maximum number of elements in the vector types
  // handled by Subzero. We use it to create the Inserts and Extracts vectors
  // with the appropriate size, thus avoiding resize() calls.
  const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
  CfgVector<const Inst *> Inserts(MaxVectorElements);
  CfgVector<const Inst *> Extracts(MaxVectorElements);

  TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
  for (CfgNode *Node : Nodes) {
    for (auto &Instr : Node->getInsts()) {
      if (!llvm::isa<InstInsertElement>(Instr)) {
        continue;
      }
      if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
        // To avoid wasting time, we only start the pattern match at the last
        // insertelement instruction -- and go backwards from there.
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump() << "\tCandidate: ";
        Instr.dump(this);
        getContext()->getStrDump() << "\n";
      }
      Inserts.resize(typeNumElements(Instr.getDest()->getType()));
      Inserts[0] = &Instr;
      if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
                                              VMetadata.get(), &Inserts)) {
        // If we fail to find a sequence of insertelements, we stop the
        // optimization.
        if (Verbose) {
          getContext()->getStrDump() << "\tFalse alarm.\n";
        }
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump() << "\tFound the following insertelement: \n";
        for (auto *I : reverse_range(Inserts)) {
          getContext()->getStrDump() << "\t\t";
          I->dump(this);
          getContext()->getStrDump() << "\n";
        }
      }
      Extracts.resize(Inserts.size());
      Variable *Src0;
      Variable *Src1;
      if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
                                               VMetadata.get(), Inserts, &Src0,
                                               &Src1, &Extracts)) {
        // If we fail to match the definitions of the insertelements' sources
        // with extractelement instructions -- or if those instructions operate
        // on more than two different variables -- we stop the optimization.
        if (Verbose) {
          getContext()->getStrDump() << "\tFailed to match extractelements.\n";
        }
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump()
            << "\tFound the following insert/extract element pairs: \n";
        for (SizeT I = 0; I < Inserts.size(); ++I) {
          const SizeT Pos = Inserts.size() - I - 1;
          getContext()->getStrDump() << "\t\tInsert : ";
          Inserts[Pos]->dump(this);
          getContext()->getStrDump() << "\n\t\tExtract: ";
          Extracts[Pos]->dump(this);
          getContext()->getStrDump() << "\n";
        }
      }

      assert(Src0 != nullptr);
      assert(Src1 != nullptr);

      auto *ShuffleVector =
          InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
      assert(ShuffleVector->getSrc(0) == Src0);
      assert(ShuffleVector->getSrc(1) == Src1);
      for (SizeT I = 0; I < Extracts.size(); ++I) {
        const SizeT Pos = Extracts.size() - I - 1;
        auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
        if (Src0 == Extracts[Pos]->getSrc(0)) {
          ShuffleVector->addIndex(Index);
        } else {
          ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
              Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
        }
      }

      if (Verbose) {
        getContext()->getStrDump() << "Created: ";
        ShuffleVector->dump(this);
        getContext()->getStrDump() << "\n";
      }

      Instr.setDeleted();
      auto &LoweringContext = getTarget()->getContext();
      LoweringContext.setInsertPoint(instToIterator(&Instr));
      LoweringContext.insert(ShuffleVector);
    }
  }
}

void Cfg::doNopInsertion() {
  if (!getFlags().getShouldDoNopInsertion())
    return;
  TimerMarker T(TimerStack::TT_doNopInsertion, this);
  RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_NopInsertion,
                            SequenceNumber);
  for (CfgNode *Node : Nodes)
    Node->doNopInsertion(RNG);
}

void Cfg::genCode() {
  TimerMarker T(TimerStack::TT_genCode, this);
  for (CfgNode *Node : Nodes)
    Node->genCode();
}

// Compute the stack frame layout.
void Cfg::genFrame() {
  TimerMarker T(TimerStack::TT_genFrame, this);
  getTarget()->addProlog(Entry);
  for (CfgNode *Node : Nodes)
    if (Node->getHasReturn())
      getTarget()->addEpilog(Node);
}

void Cfg::generateLoopInfo() {
  TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
  LoopInfo = ComputeLoopInfo(this);
}

// This is a lightweight version of live-range-end calculation. Marks the last
// use of only those variables whose definition and uses are completely with a
// single block. It is a quick single pass and doesn't need to iterate until
// convergence.
void Cfg::livenessLightweight() {
  TimerMarker T(TimerStack::TT_livenessLightweight, this);
  getVMetadata()->init(VMK_Uses);
  for (CfgNode *Node : Nodes)
    Node->livenessLightweight();
}

void Cfg::liveness(LivenessMode Mode) {
  TimerMarker T(TimerStack::TT_liveness, this);
  // Destroying the previous (if any) Liveness information clears the Liveness
  // allocator TLS pointer.
  Live = nullptr;
  Live = Liveness::create(this, Mode);

  getVMetadata()->init(VMK_Uses);
  Live->init();

  // Initialize with all nodes needing to be processed.
  BitVector NeedToProcess(Nodes.size(), true);
  while (NeedToProcess.any()) {
    // Iterate in reverse topological order to speed up convergence.
    for (CfgNode *Node : reverse_range(Nodes)) {
      if (NeedToProcess[Node->getIndex()]) {
        NeedToProcess[Node->getIndex()] = false;
        bool Changed = Node->liveness(getLiveness());
        if (Changed) {
          // If the beginning-of-block liveness changed since the last
          // iteration, mark all in-edges as needing to be processed.
          for (CfgNode *Pred : Node->getInEdges())
            NeedToProcess[Pred->getIndex()] = true;
        }
      }
    }
  }
  if (Mode == Liveness_Intervals) {
    // Reset each variable's live range.
    for (Variable *Var : Variables)
      Var->resetLiveRange();
  }
  // Make a final pass over each node to delete dead instructions, collect the
  // first and last instruction numbers, and add live range segments for that
  // node.
  for (CfgNode *Node : Nodes) {
    InstNumberT FirstInstNum = Inst::NumberSentinel;
    InstNumberT LastInstNum = Inst::NumberSentinel;
    for (Inst &I : Node->getPhis()) {
      I.deleteIfDead();
      if (Mode == Liveness_Intervals && !I.isDeleted()) {
        if (FirstInstNum == Inst::NumberSentinel)
          FirstInstNum = I.getNumber();
        assert(I.getNumber() > LastInstNum);
        LastInstNum = I.getNumber();
      }
    }
    for (Inst &I : Node->getInsts()) {
      I.deleteIfDead();
      if (Mode == Liveness_Intervals && !I.isDeleted()) {
        if (FirstInstNum == Inst::NumberSentinel)
          FirstInstNum = I.getNumber();
        assert(I.getNumber() > LastInstNum);
        LastInstNum = I.getNumber();
      }
    }
    if (Mode == Liveness_Intervals) {
      // Special treatment for live in-args. Their liveness needs to extend
      // beyond the beginning of the function, otherwise an arg whose only use
      // is in the first instruction will end up having the trivial live range
      // [2,2) and will *not* interfere with other arguments. So if the first
      // instruction of the method is "r=arg1+arg2", both args may be assigned
      // the same register. This is accomplished by extending the entry block's
      // instruction range from [2,n) to [1,n) which will transform the
      // problematic [2,2) live ranges into [1,2).  This extension works because
      // the entry node is guaranteed to have the lowest instruction numbers.
      if (Node == getEntryNode()) {
        FirstInstNum = Inst::NumberExtended;
        // Just in case the entry node somehow contains no instructions...
        if (LastInstNum == Inst::NumberSentinel)
          LastInstNum = FirstInstNum;
      }
      // If this node somehow contains no instructions, don't bother trying to
      // add liveness intervals for it, because variables that are live-in and
      // live-out will have a bogus interval added.
      if (FirstInstNum != Inst::NumberSentinel)
        Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
    }
  }
}

// Traverse every Variable of every Inst and verify that it appears within the
// Variable's computed live range.
bool Cfg::validateLiveness() const {
  TimerMarker T(TimerStack::TT_validateLiveness, this);
  bool Valid = true;
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrDump();
  for (CfgNode *Node : Nodes) {
    Inst *FirstInst = nullptr;
    for (Inst &Instr : Node->getInsts()) {
      if (Instr.isDeleted())
        continue;
      if (FirstInst == nullptr)
        FirstInst = &Instr;
      InstNumberT InstNumber = Instr.getNumber();
      if (Variable *Dest = Instr.getDest()) {
        if (!Dest->getIgnoreLiveness()) {
          bool Invalid = false;
          constexpr bool IsDest = true;
          if (!Dest->getLiveRange().containsValue(InstNumber, IsDest))
            Invalid = true;
          // Check that this instruction actually *begins* Dest's live range,
          // by checking that Dest is not live in the previous instruction. As
          // a special exception, we don't check this for the first instruction
          // of the block, because a Phi temporary may be live at the end of
          // the previous block, and if it is also assigned in the first
          // instruction of this block, the adjacent live ranges get merged.
          if (&Instr != FirstInst && !Instr.isDestRedefined() &&
              Dest->getLiveRange().containsValue(InstNumber - 1, IsDest))
            Invalid = true;
          if (Invalid) {
            Valid = false;
            Str << "Liveness error: inst " << Instr.getNumber() << " dest ";
            Dest->dump(this);
            Str << " live range " << Dest->getLiveRange() << "\n";
          }
        }
      }
      FOREACH_VAR_IN_INST(Var, Instr) {
        static constexpr bool IsDest = false;
        if (!Var->getIgnoreLiveness() &&
            !Var->getLiveRange().containsValue(InstNumber, IsDest)) {
          Valid = false;
          Str << "Liveness error: inst " << Instr.getNumber() << " var ";
          Var->dump(this);
          Str << " live range " << Var->getLiveRange() << "\n";
        }
      }
    }
  }
  return Valid;
}

void Cfg::contractEmptyNodes() {
  // If we're decorating the asm output with register liveness info, this
  // information may become corrupted or incorrect after contracting nodes that
  // contain only redundant assignments. As such, we disable this pass when
  // DecorateAsm is specified. This may make the resulting code look more
  // branchy, but it should have no effect on the register assignments.
  if (getFlags().getDecorateAsm())
    return;
  for (CfgNode *Node : Nodes) {
    Node->contractIfEmpty();
  }
}

void Cfg::doBranchOpt() {
  TimerMarker T(TimerStack::TT_doBranchOpt, this);
  for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
    auto NextNode = I + 1;
    (*I)->doBranchOpt(NextNode == E ? nullptr : *NextNode);
  }
}

void Cfg::markNodesForSandboxing() {
  for (const InstJumpTable *JT : JumpTables)
    for (SizeT I = 0; I < JT->getNumTargets(); ++I)
      JT->getTarget(I)->setNeedsAlignment();
}

// ======================== Dump routines ======================== //

// emitTextHeader() is not target-specific (apart from what is abstracted by
// the Assembler), so it is defined here rather than in the target lowering
// class.
void Cfg::emitTextHeader(GlobalString Name, GlobalContext *Ctx,
                         const Assembler *Asm) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << "\t.text\n";
  if (getFlags().getFunctionSections())
    Str << "\t.section\t.text." << Name << ",\"ax\",%progbits\n";
  if (!Asm->getInternal() || getFlags().getDisableInternal()) {
    Str << "\t.globl\t" << Name << "\n";
    Str << "\t.type\t" << Name << ",%function\n";
  }
  Str << "\t" << Asm->getAlignDirective() << " "
      << Asm->getBundleAlignLog2Bytes() << ",0x";
  for (uint8_t I : Asm->getNonExecBundlePadding())
    Str.write_hex(I);
  Str << "\n";
  Str << Name << ":\n";
}

void Cfg::emitJumpTables() {
  switch (getFlags().getOutFileType()) {
  case FT_Elf:
  case FT_Iasm: {
    // The emission needs to be delayed until the after the text section so
    // save the offsets in the global context.
    for (const InstJumpTable *JumpTable : JumpTables) {
      Ctx->addJumpTableData(JumpTable->toJumpTableData(getAssembler()));
    }
  } break;
  case FT_Asm: {
    // Emit the assembly directly so we don't need to hang on to all the names
    for (const InstJumpTable *JumpTable : JumpTables)
      getTarget()->emitJumpTable(this, JumpTable);
  } break;
  }
}

void Cfg::emit() {
  if (!BuildDefs::dump())
    return;
  TimerMarker T(TimerStack::TT_emitAsm, this);
  if (getFlags().getDecorateAsm()) {
    renumberInstructions();
    getVMetadata()->init(VMK_Uses);
    liveness(Liveness_Basic);
    dump("After recomputing liveness for -decorate-asm");
  }
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrEmit();
  const Assembler *Asm = getAssembler<>();
  const bool NeedSandboxing = getFlags().getUseSandboxing();

  emitTextHeader(FunctionName, Ctx, Asm);
  if (getFlags().getDecorateAsm()) {
    for (Variable *Var : getVariables()) {
      if (Var->hasKnownStackOffset() && !Var->isRematerializable()) {
        Str << "\t" << Var->getSymbolicStackOffset() << " = "
            << Var->getStackOffset() << "\n";
      }
    }
  }
  for (CfgNode *Node : Nodes) {
    if (NeedSandboxing && Node->needsAlignment()) {
      Str << "\t" << Asm->getAlignDirective() << " "
          << Asm->getBundleAlignLog2Bytes() << "\n";
    }
    Node->emit(this);
  }
  emitJumpTables();
  Str << "\n";
}

void Cfg::emitIAS() {
  TimerMarker T(TimerStack::TT_emitAsm, this);
  // The emitIAS() routines emit into the internal assembler buffer, so there's
  // no need to lock the streams.
  const bool NeedSandboxing = getFlags().getUseSandboxing();
  for (CfgNode *Node : Nodes) {
    if (NeedSandboxing && Node->needsAlignment())
      getAssembler()->alignCfgNode();
    Node->emitIAS(this);
  }
  emitJumpTables();
}

size_t Cfg::getTotalMemoryMB() const {
  constexpr size_t _1MB = 1024 * 1024;
  assert(Allocator != nullptr);
  assert(CfgAllocatorTraits::current() == Allocator.get());
  return Allocator->getTotalMemory() / _1MB;
}

size_t Cfg::getLivenessMemoryMB() const {
  constexpr size_t _1MB = 1024 * 1024;
  if (Live == nullptr) {
    return 0;
  }
  return Live->getAllocator()->getTotalMemory() / _1MB;
}

// Dumps the IR with an optional introductory message.
void Cfg::dump(const char *Message) {
  if (!BuildDefs::dump())
    return;
  if (!isVerbose())
    return;
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrDump();
  if (Message[0])
    Str << "================ " << Message << " ================\n";
  if (isVerbose(IceV_Mem)) {
    Str << "Memory size = " << getTotalMemoryMB() << " MB\n";
  }
  setCurrentNode(getEntryNode());
  // Print function name+args
  if (isVerbose(IceV_Instructions)) {
    Str << "define ";
    if (getInternal() && !getFlags().getDisableInternal())
      Str << "internal ";
    Str << ReturnType << " @" << getFunctionName() << "(";
    for (SizeT i = 0; i < Args.size(); ++i) {
      if (i > 0)
        Str << ", ";
      Str << Args[i]->getType() << " ";
      Args[i]->dump(this);
    }
    // Append an extra copy of the function name here, in order to print its
    // size stats but not mess up lit tests.
    Str << ") { # " << getFunctionNameAndSize() << "\n";
  }
  resetCurrentNode();
  if (isVerbose(IceV_Liveness)) {
    // Print summary info about variables
    for (Variable *Var : Variables) {
      Str << "// multiblock=";
      if (getVMetadata()->isTracked(Var))
        Str << getVMetadata()->isMultiBlock(Var);
      else
        Str << "?";
      Str << " defs=";
      bool FirstPrint = true;
      if (VMetadata->getKind() != VMK_Uses) {
        if (const Inst *FirstDef = VMetadata->getFirstDefinition(Var)) {
          Str << FirstDef->getNumber();
          FirstPrint = false;
        }
      }
      if (VMetadata->getKind() == VMK_All) {
        for (const Inst *Instr : VMetadata->getLatterDefinitions(Var)) {
          if (!FirstPrint)
            Str << ",";
          Str << Instr->getNumber();
          FirstPrint = false;
        }
      }
      Str << " weight=" << Var->getWeight(this) << " ";
      Var->dump(this);
      Str << " LIVE=" << Var->getLiveRange() << "\n";
    }
  }
  // Print each basic block
  for (CfgNode *Node : Nodes)
    Node->dump(this);
  if (isVerbose(IceV_Instructions))
    Str << "}\n";
}

} // end of namespace Ice