Home | History | Annotate | Download | only in CodeGen
      1 //===-- MachineBlockPlacement.cpp - Basic Block Code Layout optimization --===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file implements basic block placement transformations using the CFG
     11 // structure and branch probability estimates.
     12 //
     13 // The pass strives to preserve the structure of the CFG (that is, retain
     14 // a topological ordering of basic blocks) in the absence of a *strong* signal
     15 // to the contrary from probabilities. However, within the CFG structure, it
     16 // attempts to choose an ordering which favors placing more likely sequences of
     17 // blocks adjacent to each other.
     18 //
     19 // The algorithm works from the inner-most loop within a function outward, and
     20 // at each stage walks through the basic blocks, trying to coalesce them into
     21 // sequential chains where allowed by the CFG (or demanded by heavy
     22 // probabilities). Finally, it walks the blocks in topological order, and the
     23 // first time it reaches a chain of basic blocks, it schedules them in the
     24 // function in-order.
     25 //
     26 //===----------------------------------------------------------------------===//
     27 
     28 #include "llvm/CodeGen/Passes.h"
     29 #include "llvm/CodeGen/TargetPassConfig.h"
     30 #include "BranchFolding.h"
     31 #include "llvm/ADT/DenseMap.h"
     32 #include "llvm/ADT/SmallPtrSet.h"
     33 #include "llvm/ADT/SmallVector.h"
     34 #include "llvm/ADT/Statistic.h"
     35 #include "llvm/CodeGen/MachineBasicBlock.h"
     36 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
     37 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
     38 #include "llvm/CodeGen/MachineDominators.h"
     39 #include "llvm/CodeGen/MachineFunction.h"
     40 #include "llvm/CodeGen/MachineFunctionPass.h"
     41 #include "llvm/CodeGen/MachineLoopInfo.h"
     42 #include "llvm/CodeGen/MachineModuleInfo.h"
     43 #include "llvm/Support/Allocator.h"
     44 #include "llvm/Support/CommandLine.h"
     45 #include "llvm/Support/Debug.h"
     46 #include "llvm/Support/raw_ostream.h"
     47 #include "llvm/Target/TargetInstrInfo.h"
     48 #include "llvm/Target/TargetLowering.h"
     49 #include "llvm/Target/TargetSubtargetInfo.h"
     50 #include <algorithm>
     51 using namespace llvm;
     52 
     53 #define DEBUG_TYPE "block-placement"
     54 
     55 STATISTIC(NumCondBranches, "Number of conditional branches");
     56 STATISTIC(NumUncondBranches, "Number of unconditional branches");
     57 STATISTIC(CondBranchTakenFreq,
     58           "Potential frequency of taking conditional branches");
     59 STATISTIC(UncondBranchTakenFreq,
     60           "Potential frequency of taking unconditional branches");
     61 
     62 static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
     63                                        cl::desc("Force the alignment of all "
     64                                                 "blocks in the function."),
     65                                        cl::init(0), cl::Hidden);
     66 
     67 static cl::opt<unsigned> AlignAllNonFallThruBlocks(
     68     "align-all-nofallthru-blocks",
     69     cl::desc("Force the alignment of all "
     70              "blocks that have no fall-through predecessors (i.e. don't add "
     71              "nops that are executed)."),
     72     cl::init(0), cl::Hidden);
     73 
     74 // FIXME: Find a good default for this flag and remove the flag.
     75 static cl::opt<unsigned> ExitBlockBias(
     76     "block-placement-exit-block-bias",
     77     cl::desc("Block frequency percentage a loop exit block needs "
     78              "over the original exit to be considered the new exit."),
     79     cl::init(0), cl::Hidden);
     80 
     81 static cl::opt<bool> OutlineOptionalBranches(
     82     "outline-optional-branches",
     83     cl::desc("Put completely optional branches, i.e. branches with a common "
     84              "post dominator, out of line."),
     85     cl::init(false), cl::Hidden);
     86 
     87 static cl::opt<unsigned> OutlineOptionalThreshold(
     88     "outline-optional-threshold",
     89     cl::desc("Don't outline optional branches that are a single block with an "
     90              "instruction count below this threshold"),
     91     cl::init(4), cl::Hidden);
     92 
     93 static cl::opt<unsigned> LoopToColdBlockRatio(
     94     "loop-to-cold-block-ratio",
     95     cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
     96              "(frequency of block) is greater than this ratio"),
     97     cl::init(5), cl::Hidden);
     98 
     99 static cl::opt<bool>
    100     PreciseRotationCost("precise-rotation-cost",
    101                         cl::desc("Model the cost of loop rotation more "
    102                                  "precisely by using profile data."),
    103                         cl::init(false), cl::Hidden);
    104 static cl::opt<bool>
    105     ForcePreciseRotationCost("force-precise-rotation-cost",
    106                              cl::desc("Force the use of precise cost "
    107                                       "loop rotation strategy."),
    108                              cl::init(false), cl::Hidden);
    109 
    110 static cl::opt<unsigned> MisfetchCost(
    111     "misfetch-cost",
    112     cl::desc("Cost that models the probablistic risk of an instruction "
    113              "misfetch due to a jump comparing to falling through, whose cost "
    114              "is zero."),
    115     cl::init(1), cl::Hidden);
    116 
    117 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
    118                                       cl::desc("Cost of jump instructions."),
    119                                       cl::init(1), cl::Hidden);
    120 
    121 static cl::opt<bool>
    122 BranchFoldPlacement("branch-fold-placement",
    123               cl::desc("Perform branch folding during placement. "
    124                        "Reduces code size."),
    125               cl::init(true), cl::Hidden);
    126 
    127 extern cl::opt<unsigned> StaticLikelyProb;
    128 extern cl::opt<unsigned> ProfileLikelyProb;
    129 
    130 namespace {
    131 class BlockChain;
    132 /// \brief Type for our function-wide basic block -> block chain mapping.
    133 typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
    134 }
    135 
    136 namespace {
    137 /// \brief A chain of blocks which will be laid out contiguously.
    138 ///
    139 /// This is the datastructure representing a chain of consecutive blocks that
    140 /// are profitable to layout together in order to maximize fallthrough
    141 /// probabilities and code locality. We also can use a block chain to represent
    142 /// a sequence of basic blocks which have some external (correctness)
    143 /// requirement for sequential layout.
    144 ///
    145 /// Chains can be built around a single basic block and can be merged to grow
    146 /// them. They participate in a block-to-chain mapping, which is updated
    147 /// automatically as chains are merged together.
    148 class BlockChain {
    149   /// \brief The sequence of blocks belonging to this chain.
    150   ///
    151   /// This is the sequence of blocks for a particular chain. These will be laid
    152   /// out in-order within the function.
    153   SmallVector<MachineBasicBlock *, 4> Blocks;
    154 
    155   /// \brief A handle to the function-wide basic block to block chain mapping.
    156   ///
    157   /// This is retained in each block chain to simplify the computation of child
    158   /// block chains for SCC-formation and iteration. We store the edges to child
    159   /// basic blocks, and map them back to their associated chains using this
    160   /// structure.
    161   BlockToChainMapType &BlockToChain;
    162 
    163 public:
    164   /// \brief Construct a new BlockChain.
    165   ///
    166   /// This builds a new block chain representing a single basic block in the
    167   /// function. It also registers itself as the chain that block participates
    168   /// in with the BlockToChain mapping.
    169   BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
    170       : Blocks(1, BB), BlockToChain(BlockToChain), UnscheduledPredecessors(0) {
    171     assert(BB && "Cannot create a chain with a null basic block");
    172     BlockToChain[BB] = this;
    173   }
    174 
    175   /// \brief Iterator over blocks within the chain.
    176   typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
    177 
    178   /// \brief Beginning of blocks within the chain.
    179   iterator begin() { return Blocks.begin(); }
    180 
    181   /// \brief End of blocks within the chain.
    182   iterator end() { return Blocks.end(); }
    183 
    184   /// \brief Merge a block chain into this one.
    185   ///
    186   /// This routine merges a block chain into this one. It takes care of forming
    187   /// a contiguous sequence of basic blocks, updating the edge list, and
    188   /// updating the block -> chain mapping. It does not free or tear down the
    189   /// old chain, but the old chain's block list is no longer valid.
    190   void merge(MachineBasicBlock *BB, BlockChain *Chain) {
    191     assert(BB);
    192     assert(!Blocks.empty());
    193 
    194     // Fast path in case we don't have a chain already.
    195     if (!Chain) {
    196       assert(!BlockToChain[BB]);
    197       Blocks.push_back(BB);
    198       BlockToChain[BB] = this;
    199       return;
    200     }
    201 
    202     assert(BB == *Chain->begin());
    203     assert(Chain->begin() != Chain->end());
    204 
    205     // Update the incoming blocks to point to this chain, and add them to the
    206     // chain structure.
    207     for (MachineBasicBlock *ChainBB : *Chain) {
    208       Blocks.push_back(ChainBB);
    209       assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain");
    210       BlockToChain[ChainBB] = this;
    211     }
    212   }
    213 
    214 #ifndef NDEBUG
    215   /// \brief Dump the blocks in this chain.
    216   LLVM_DUMP_METHOD void dump() {
    217     for (MachineBasicBlock *MBB : *this)
    218       MBB->dump();
    219   }
    220 #endif // NDEBUG
    221 
    222   /// \brief Count of predecessors of any block within the chain which have not
    223   /// yet been scheduled.  In general, we will delay scheduling this chain
    224   /// until those predecessors are scheduled (or we find a sufficiently good
    225   /// reason to override this heuristic.)  Note that when forming loop chains,
    226   /// blocks outside the loop are ignored and treated as if they were already
    227   /// scheduled.
    228   ///
    229   /// Note: This field is reinitialized multiple times - once for each loop,
    230   /// and then once for the function as a whole.
    231   unsigned UnscheduledPredecessors;
    232 };
    233 }
    234 
    235 namespace {
    236 class MachineBlockPlacement : public MachineFunctionPass {
    237   /// \brief A typedef for a block filter set.
    238   typedef SmallPtrSet<MachineBasicBlock *, 16> BlockFilterSet;
    239 
    240   /// \brief work lists of blocks that are ready to be laid out
    241   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
    242   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
    243 
    244   /// \brief Machine Function
    245   MachineFunction *F;
    246 
    247   /// \brief A handle to the branch probability pass.
    248   const MachineBranchProbabilityInfo *MBPI;
    249 
    250   /// \brief A handle to the function-wide block frequency pass.
    251   std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
    252 
    253   /// \brief A handle to the loop info.
    254   MachineLoopInfo *MLI;
    255 
    256   /// \brief A handle to the target's instruction info.
    257   const TargetInstrInfo *TII;
    258 
    259   /// \brief A handle to the target's lowering info.
    260   const TargetLoweringBase *TLI;
    261 
    262   /// \brief A handle to the post dominator tree.
    263   MachineDominatorTree *MDT;
    264 
    265   /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
    266   /// all terminators of the MachineFunction.
    267   SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
    268 
    269   /// \brief Allocator and owner of BlockChain structures.
    270   ///
    271   /// We build BlockChains lazily while processing the loop structure of
    272   /// a function. To reduce malloc traffic, we allocate them using this
    273   /// slab-like allocator, and destroy them after the pass completes. An
    274   /// important guarantee is that this allocator produces stable pointers to
    275   /// the chains.
    276   SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
    277 
    278   /// \brief Function wide BasicBlock to BlockChain mapping.
    279   ///
    280   /// This mapping allows efficiently moving from any given basic block to the
    281   /// BlockChain it participates in, if any. We use it to, among other things,
    282   /// allow implicitly defining edges between chains as the existing edges
    283   /// between basic blocks.
    284   DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
    285 
    286   void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
    287                            const BlockFilterSet *BlockFilter = nullptr);
    288   BranchProbability
    289   collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
    290                           const BlockFilterSet *BlockFilter,
    291                           SmallVector<MachineBasicBlock *, 4> &Successors);
    292   bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ,
    293                                  BlockChain &Chain,
    294                                  const BlockFilterSet *BlockFilter,
    295                                  BranchProbability SuccProb,
    296                                  BranchProbability HotProb);
    297   bool
    298   hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
    299                              BlockChain &SuccChain, BranchProbability SuccProb,
    300                              BranchProbability RealSuccProb, BlockChain &Chain,
    301                              const BlockFilterSet *BlockFilter);
    302   MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
    303                                          BlockChain &Chain,
    304                                          const BlockFilterSet *BlockFilter);
    305   MachineBasicBlock *
    306   selectBestCandidateBlock(BlockChain &Chain,
    307                            SmallVectorImpl<MachineBasicBlock *> &WorkList);
    308   MachineBasicBlock *
    309   getFirstUnplacedBlock(const BlockChain &PlacedChain,
    310                         MachineFunction::iterator &PrevUnplacedBlockIt,
    311                         const BlockFilterSet *BlockFilter);
    312 
    313   /// \brief Add a basic block to the work list if it is apropriate.
    314   ///
    315   /// If the optional parameter BlockFilter is provided, only MBB
    316   /// present in the set will be added to the worklist. If nullptr
    317   /// is provided, no filtering occurs.
    318   void fillWorkLists(MachineBasicBlock *MBB,
    319                      SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
    320                      const BlockFilterSet *BlockFilter);
    321   void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
    322                   const BlockFilterSet *BlockFilter = nullptr);
    323   MachineBasicBlock *findBestLoopTop(MachineLoop &L,
    324                                      const BlockFilterSet &LoopBlockSet);
    325   MachineBasicBlock *findBestLoopExit(MachineLoop &L,
    326                                       const BlockFilterSet &LoopBlockSet);
    327   BlockFilterSet collectLoopBlockSet(MachineLoop &L);
    328   void buildLoopChains(MachineLoop &L);
    329   void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
    330                   const BlockFilterSet &LoopBlockSet);
    331   void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
    332                              const BlockFilterSet &LoopBlockSet);
    333   void collectMustExecuteBBs();
    334   void buildCFGChains();
    335   void optimizeBranches();
    336   void alignBlocks();
    337 
    338 public:
    339   static char ID; // Pass identification, replacement for typeid
    340   MachineBlockPlacement() : MachineFunctionPass(ID) {
    341     initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry());
    342   }
    343 
    344   bool runOnMachineFunction(MachineFunction &F) override;
    345 
    346   void getAnalysisUsage(AnalysisUsage &AU) const override {
    347     AU.addRequired<MachineBranchProbabilityInfo>();
    348     AU.addRequired<MachineBlockFrequencyInfo>();
    349     AU.addRequired<MachineDominatorTree>();
    350     AU.addRequired<MachineLoopInfo>();
    351     AU.addRequired<TargetPassConfig>();
    352     MachineFunctionPass::getAnalysisUsage(AU);
    353   }
    354 };
    355 }
    356 
    357 char MachineBlockPlacement::ID = 0;
    358 char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
    359 INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement",
    360                       "Branch Probability Basic Block Placement", false, false)
    361 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
    362 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
    363 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
    364 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
    365 INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
    366                     "Branch Probability Basic Block Placement", false, false)
    367 
    368 #ifndef NDEBUG
    369 /// \brief Helper to print the name of a MBB.
    370 ///
    371 /// Only used by debug logging.
    372 static std::string getBlockName(MachineBasicBlock *BB) {
    373   std::string Result;
    374   raw_string_ostream OS(Result);
    375   OS << "BB#" << BB->getNumber();
    376   OS << " ('" << BB->getName() << "')";
    377   OS.flush();
    378   return Result;
    379 }
    380 #endif
    381 
    382 /// \brief Mark a chain's successors as having one fewer preds.
    383 ///
    384 /// When a chain is being merged into the "placed" chain, this routine will
    385 /// quickly walk the successors of each block in the chain and mark them as
    386 /// having one fewer active predecessor. It also adds any successors of this
    387 /// chain which reach the zero-predecessor state to the worklist passed in.
    388 void MachineBlockPlacement::markChainSuccessors(
    389     BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
    390     const BlockFilterSet *BlockFilter) {
    391   // Walk all the blocks in this chain, marking their successors as having
    392   // a predecessor placed.
    393   for (MachineBasicBlock *MBB : Chain) {
    394     // Add any successors for which this is the only un-placed in-loop
    395     // predecessor to the worklist as a viable candidate for CFG-neutral
    396     // placement. No subsequent placement of this block will violate the CFG
    397     // shape, so we get to use heuristics to choose a favorable placement.
    398     for (MachineBasicBlock *Succ : MBB->successors()) {
    399       if (BlockFilter && !BlockFilter->count(Succ))
    400         continue;
    401       BlockChain &SuccChain = *BlockToChain[Succ];
    402       // Disregard edges within a fixed chain, or edges to the loop header.
    403       if (&Chain == &SuccChain || Succ == LoopHeaderBB)
    404         continue;
    405 
    406       // This is a cross-chain edge that is within the loop, so decrement the
    407       // loop predecessor count of the destination chain.
    408       if (SuccChain.UnscheduledPredecessors == 0 ||
    409           --SuccChain.UnscheduledPredecessors > 0)
    410         continue;
    411 
    412       auto *MBB = *SuccChain.begin();
    413       if (MBB->isEHPad())
    414         EHPadWorkList.push_back(MBB);
    415       else
    416         BlockWorkList.push_back(MBB);
    417     }
    418   }
    419 }
    420 
    421 /// This helper function collects the set of successors of block
    422 /// \p BB that are allowed to be its layout successors, and return
    423 /// the total branch probability of edges from \p BB to those
    424 /// blocks.
    425 BranchProbability MachineBlockPlacement::collectViableSuccessors(
    426     MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter,
    427     SmallVector<MachineBasicBlock *, 4> &Successors) {
    428   // Adjust edge probabilities by excluding edges pointing to blocks that is
    429   // either not in BlockFilter or is already in the current chain. Consider the
    430   // following CFG:
    431   //
    432   //     --->A
    433   //     |  / \
    434   //     | B   C
    435   //     |  \ / \
    436   //     ----D   E
    437   //
    438   // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
    439   // A->C is chosen as a fall-through, D won't be selected as a successor of C
    440   // due to CFG constraint (the probability of C->D is not greater than
    441   // HotProb to break top-oorder). If we exclude E that is not in BlockFilter
    442   // when calculating the  probability of C->D, D will be selected and we
    443   // will get A C D B as the layout of this loop.
    444   auto AdjustedSumProb = BranchProbability::getOne();
    445   for (MachineBasicBlock *Succ : BB->successors()) {
    446     bool SkipSucc = false;
    447     if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
    448       SkipSucc = true;
    449     } else {
    450       BlockChain *SuccChain = BlockToChain[Succ];
    451       if (SuccChain == &Chain) {
    452         SkipSucc = true;
    453       } else if (Succ != *SuccChain->begin()) {
    454         DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> Mid chain!\n");
    455         continue;
    456       }
    457     }
    458     if (SkipSucc)
    459       AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
    460     else
    461       Successors.push_back(Succ);
    462   }
    463 
    464   return AdjustedSumProb;
    465 }
    466 
    467 /// The helper function returns the branch probability that is adjusted
    468 /// or normalized over the new total \p AdjustedSumProb.
    469 
    470 static BranchProbability
    471 getAdjustedProbability(BranchProbability OrigProb,
    472                        BranchProbability AdjustedSumProb) {
    473   BranchProbability SuccProb;
    474   uint32_t SuccProbN = OrigProb.getNumerator();
    475   uint32_t SuccProbD = AdjustedSumProb.getNumerator();
    476   if (SuccProbN >= SuccProbD)
    477     SuccProb = BranchProbability::getOne();
    478   else
    479     SuccProb = BranchProbability(SuccProbN, SuccProbD);
    480 
    481   return SuccProb;
    482 }
    483 
    484 /// When the option OutlineOptionalBranches is on, this method
    485 /// checks if the fallthrough candidate block \p Succ (of block
    486 /// \p BB) also has other unscheduled predecessor blocks which
    487 /// are also successors of \p BB (forming triagular shape CFG).
    488 /// If none of such predecessors are small, it returns true.
    489 /// The caller can choose to select \p Succ as the layout successors
    490 /// so that \p Succ's predecessors (optional branches) can be
    491 /// outlined.
    492 /// FIXME: fold this with more general layout cost analysis.
    493 bool MachineBlockPlacement::shouldPredBlockBeOutlined(
    494     MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
    495     const BlockFilterSet *BlockFilter, BranchProbability SuccProb,
    496     BranchProbability HotProb) {
    497   if (!OutlineOptionalBranches)
    498     return false;
    499   // If we outline optional branches, look whether Succ is unavoidable, i.e.
    500   // dominates all terminators of the MachineFunction. If it does, other
    501   // successors must be optional. Don't do this for cold branches.
    502   if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) {
    503     for (MachineBasicBlock *Pred : Succ->predecessors()) {
    504       // Check whether there is an unplaced optional branch.
    505       if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
    506           BlockToChain[Pred] == &Chain)
    507         continue;
    508       // Check whether the optional branch has exactly one BB.
    509       if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
    510         continue;
    511       // Check whether the optional branch is small.
    512       if (Pred->size() < OutlineOptionalThreshold)
    513         return false;
    514     }
    515     return true;
    516   } else
    517     return false;
    518 }
    519 
    520 // When profile is not present, return the StaticLikelyProb.
    521 // When profile is available, we need to handle the triangle-shape CFG.
    522 static BranchProbability getLayoutSuccessorProbThreshold(
    523       MachineBasicBlock *BB) {
    524   if (!BB->getParent()->getFunction()->getEntryCount())
    525     return BranchProbability(StaticLikelyProb, 100);
    526   if (BB->succ_size() == 2) {
    527     const MachineBasicBlock *Succ1 = *BB->succ_begin();
    528     const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
    529     if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
    530       /* See case 1 below for the cost analysis. For BB->Succ to
    531        * be taken with smaller cost, the following needs to hold:
    532        *   Prob(BB->Succ) > 2* Prob(BB->Pred)
    533        *   So the threshold T
    534        *   T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
    535        * We have  T + T/2 = 1, i.e. T = 2/3. Also adding user specified
    536        * branch bias, we have
    537        *   T = (2/3)*(ProfileLikelyProb/50)
    538        *     = (2*ProfileLikelyProb)/150)
    539        */
    540       return BranchProbability(2 * ProfileLikelyProb, 150);
    541     }
    542   }
    543   return BranchProbability(ProfileLikelyProb, 100);
    544 }
    545 
    546 /// Checks to see if the layout candidate block \p Succ has a better layout
    547 /// predecessor than \c BB. If yes, returns true.
    548 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
    549     MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
    550     BranchProbability SuccProb, BranchProbability RealSuccProb,
    551     BlockChain &Chain, const BlockFilterSet *BlockFilter) {
    552 
    553   // This is no global conflict, just return false.
    554   if (SuccChain.UnscheduledPredecessors == 0)
    555     return false;
    556 
    557   // There are two basic scenarios here:
    558   // -------------------------------------
    559   // Case 1: triagular shape CFG:
    560   //     BB
    561   //     | \
    562   //     |  \
    563   //     |   Pred
    564   //     |   /
    565   //     Succ
    566   // In this case, we are evaluating whether to select edge -> Succ, e.g.
    567   // set Succ as the layout successor of BB. Picking Succ as BB's
    568   // successor breaks the  CFG constraints. With this layout, Pred BB
    569   // is forced to be outlined, so the overall cost will be cost of the
    570   // branch taken from BB to Pred, plus the cost of back taken branch
    571   // from Pred to Succ, as well as the additional cost asssociated
    572   // with the needed unconditional jump instruction from Pred To Succ.
    573   // The cost of the topological order layout is the taken branch cost
    574   // from BB to Succ, so to make BB->Succ a viable candidate, the following
    575   // must hold:
    576   //     2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
    577   //      < freq(BB->Succ) *  taken_branch_cost.
    578   // Ignoring unconditional jump cost, we get
    579   //    freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
    580   //    prob(BB->Succ) > 2 * prob(BB->Pred)
    581   //
    582   // When real profile data is available, we can precisely compute the the
    583   // probabililty threshold that is needed for edge BB->Succ to be considered.
    584   // With out profile data, the heuristic requires the branch bias to be
    585   // a lot larger to make sure the signal is very strong (e.g. 80% default).
    586   // -----------------------------------------------------------------
    587   // Case 2: diamond like CFG:
    588   //     S
    589   //    / \
    590   //   |   \
    591   //  BB    Pred
    592   //   \    /
    593   //    Succ
    594   //    ..
    595   // In this case, edge S->BB has already been selected, and we are evaluating
    596   // candidate edge BB->Succ. Edge S->BB is selected because prob(S->BB)
    597   // is no less than prob(S->Pred). When real profile data is *available*, if
    598   // the condition is true, it will be always better to continue the trace with
    599   // edge BB->Succ instead of laying out with topological order (i.e. laying
    600   // Pred first).  The cost of S->BB->Succ is 2 * freq (S->Pred), while with
    601   // the topo order, the cost is freq(S-> Pred) + Pred(S->BB) which is larger.
    602   // When profile data is not available, however, we need to be more
    603   // conservative. If the branch prediction is wrong, breaking the topo-order
    604   // will actually yield a layout with large cost. For this reason, we need
    605   // strong biaaed branch at block S with Prob(S->BB) in order to select
    606   // BB->Succ. This is equialant to looking the CFG backward with backward
    607   // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
    608   // profile data).
    609 
    610   BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
    611 
    612   // Forward checking. For case 2, SuccProb will be 1.
    613   if (SuccProb < HotProb) {
    614     DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
    615                  << " (prob) (CFG conflict)\n");
    616     return true;
    617   }
    618 
    619   // Make sure that a hot successor doesn't have a globally more
    620   // important predecessor.
    621   BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
    622   bool BadCFGConflict = false;
    623 
    624   for (MachineBasicBlock *Pred : Succ->predecessors()) {
    625     if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
    626         (BlockFilter && !BlockFilter->count(Pred)) ||
    627         BlockToChain[Pred] == &Chain)
    628       continue;
    629     // Do backward checking. For case 1, it is actually redundant check. For
    630     // case 2 above, we need a backward checking to filter out edges that are
    631     // not 'strongly' biased. With profile data available, the check is mostly
    632     // redundant too (when threshold prob is set at 50%) unless S has more than
    633     // two successors.
    634     // BB  Pred
    635     //  \ /
    636     //  Succ
    637     // We select edgee BB->Succ if
    638     //      freq(BB->Succ) > freq(Succ) * HotProb
    639     //      i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
    640     //      HotProb
    641     //      i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
    642     BlockFrequency PredEdgeFreq =
    643         MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
    644     if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
    645       BadCFGConflict = true;
    646       break;
    647     }
    648   }
    649 
    650   if (BadCFGConflict) {
    651     DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
    652                  << " (prob) (non-cold CFG conflict)\n");
    653     return true;
    654   }
    655 
    656   return false;
    657 }
    658 
    659 /// \brief Select the best successor for a block.
    660 ///
    661 /// This looks across all successors of a particular block and attempts to
    662 /// select the "best" one to be the layout successor. It only considers direct
    663 /// successors which also pass the block filter. It will attempt to avoid
    664 /// breaking CFG structure, but cave and break such structures in the case of
    665 /// very hot successor edges.
    666 ///
    667 /// \returns The best successor block found, or null if none are viable.
    668 MachineBasicBlock *
    669 MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
    670                                            BlockChain &Chain,
    671                                            const BlockFilterSet *BlockFilter) {
    672   const BranchProbability HotProb(StaticLikelyProb, 100);
    673 
    674   MachineBasicBlock *BestSucc = nullptr;
    675   auto BestProb = BranchProbability::getZero();
    676 
    677   SmallVector<MachineBasicBlock *, 4> Successors;
    678   auto AdjustedSumProb =
    679       collectViableSuccessors(BB, Chain, BlockFilter, Successors);
    680 
    681   DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
    682   for (MachineBasicBlock *Succ : Successors) {
    683     auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
    684     BranchProbability SuccProb =
    685         getAdjustedProbability(RealSuccProb, AdjustedSumProb);
    686 
    687     // This heuristic is off by default.
    688     if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
    689                                   HotProb))
    690       return Succ;
    691 
    692     BlockChain &SuccChain = *BlockToChain[Succ];
    693     // Skip the edge \c BB->Succ if block \c Succ has a better layout
    694     // predecessor that yields lower global cost.
    695     if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
    696                                    Chain, BlockFilter))
    697       continue;
    698 
    699     DEBUG(
    700         dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
    701                << " (prob)"
    702                << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
    703                << "\n");
    704     if (BestSucc && BestProb >= SuccProb)
    705       continue;
    706     BestSucc = Succ;
    707     BestProb = SuccProb;
    708   }
    709   return BestSucc;
    710 }
    711 
    712 /// \brief Select the best block from a worklist.
    713 ///
    714 /// This looks through the provided worklist as a list of candidate basic
    715 /// blocks and select the most profitable one to place. The definition of
    716 /// profitable only really makes sense in the context of a loop. This returns
    717 /// the most frequently visited block in the worklist, which in the case of
    718 /// a loop, is the one most desirable to be physically close to the rest of the
    719 /// loop body in order to improve icache behavior.
    720 ///
    721 /// \returns The best block found, or null if none are viable.
    722 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
    723     BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
    724   // Once we need to walk the worklist looking for a candidate, cleanup the
    725   // worklist of already placed entries.
    726   // FIXME: If this shows up on profiles, it could be folded (at the cost of
    727   // some code complexity) into the loop below.
    728   WorkList.erase(std::remove_if(WorkList.begin(), WorkList.end(),
    729                                 [&](MachineBasicBlock *BB) {
    730                                   return BlockToChain.lookup(BB) == &Chain;
    731                                 }),
    732                  WorkList.end());
    733 
    734   if (WorkList.empty())
    735     return nullptr;
    736 
    737   bool IsEHPad = WorkList[0]->isEHPad();
    738 
    739   MachineBasicBlock *BestBlock = nullptr;
    740   BlockFrequency BestFreq;
    741   for (MachineBasicBlock *MBB : WorkList) {
    742     assert(MBB->isEHPad() == IsEHPad);
    743 
    744     BlockChain &SuccChain = *BlockToChain[MBB];
    745     if (&SuccChain == &Chain)
    746       continue;
    747 
    748     assert(SuccChain.UnscheduledPredecessors == 0 && "Found CFG-violating block");
    749 
    750     BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
    751     DEBUG(dbgs() << "    " << getBlockName(MBB) << " -> ";
    752           MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
    753 
    754     // For ehpad, we layout the least probable first as to avoid jumping back
    755     // from least probable landingpads to more probable ones.
    756     //
    757     // FIXME: Using probability is probably (!) not the best way to achieve
    758     // this. We should probably have a more principled approach to layout
    759     // cleanup code.
    760     //
    761     // The goal is to get:
    762     //
    763     //                 +--------------------------+
    764     //                 |                          V
    765     // InnerLp -> InnerCleanup    OuterLp -> OuterCleanup -> Resume
    766     //
    767     // Rather than:
    768     //
    769     //                 +-------------------------------------+
    770     //                 V                                     |
    771     // OuterLp -> OuterCleanup -> Resume     InnerLp -> InnerCleanup
    772     if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
    773       continue;
    774 
    775     BestBlock = MBB;
    776     BestFreq = CandidateFreq;
    777   }
    778 
    779   return BestBlock;
    780 }
    781 
    782 /// \brief Retrieve the first unplaced basic block.
    783 ///
    784 /// This routine is called when we are unable to use the CFG to walk through
    785 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
    786 /// We walk through the function's blocks in order, starting from the
    787 /// LastUnplacedBlockIt. We update this iterator on each call to avoid
    788 /// re-scanning the entire sequence on repeated calls to this routine.
    789 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
    790     const BlockChain &PlacedChain,
    791     MachineFunction::iterator &PrevUnplacedBlockIt,
    792     const BlockFilterSet *BlockFilter) {
    793   for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
    794        ++I) {
    795     if (BlockFilter && !BlockFilter->count(&*I))
    796       continue;
    797     if (BlockToChain[&*I] != &PlacedChain) {
    798       PrevUnplacedBlockIt = I;
    799       // Now select the head of the chain to which the unplaced block belongs
    800       // as the block to place. This will force the entire chain to be placed,
    801       // and satisfies the requirements of merging chains.
    802       return *BlockToChain[&*I]->begin();
    803     }
    804   }
    805   return nullptr;
    806 }
    807 
    808 void MachineBlockPlacement::fillWorkLists(
    809     MachineBasicBlock *MBB,
    810     SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
    811     const BlockFilterSet *BlockFilter = nullptr) {
    812   BlockChain &Chain = *BlockToChain[MBB];
    813   if (!UpdatedPreds.insert(&Chain).second)
    814     return;
    815 
    816   assert(Chain.UnscheduledPredecessors == 0);
    817   for (MachineBasicBlock *ChainBB : Chain) {
    818     assert(BlockToChain[ChainBB] == &Chain);
    819     for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
    820       if (BlockFilter && !BlockFilter->count(Pred))
    821         continue;
    822       if (BlockToChain[Pred] == &Chain)
    823         continue;
    824       ++Chain.UnscheduledPredecessors;
    825     }
    826   }
    827 
    828   if (Chain.UnscheduledPredecessors != 0)
    829     return;
    830 
    831   MBB = *Chain.begin();
    832   if (MBB->isEHPad())
    833     EHPadWorkList.push_back(MBB);
    834   else
    835     BlockWorkList.push_back(MBB);
    836 }
    837 
    838 void MachineBlockPlacement::buildChain(
    839     MachineBasicBlock *BB, BlockChain &Chain,
    840     const BlockFilterSet *BlockFilter) {
    841   assert(BB && "BB must not be null.\n");
    842   assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
    843   MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
    844 
    845   MachineBasicBlock *LoopHeaderBB = BB;
    846   markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
    847   BB = *std::prev(Chain.end());
    848   for (;;) {
    849     assert(BB && "null block found at end of chain in loop.");
    850     assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
    851     assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
    852 
    853 
    854     // Look for the best viable successor if there is one to place immediately
    855     // after this block.
    856     MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
    857 
    858     // If an immediate successor isn't available, look for the best viable
    859     // block among those we've identified as not violating the loop's CFG at
    860     // this point. This won't be a fallthrough, but it will increase locality.
    861     if (!BestSucc)
    862       BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
    863     if (!BestSucc)
    864       BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
    865 
    866     if (!BestSucc) {
    867       BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
    868       if (!BestSucc)
    869         break;
    870 
    871       DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
    872                       "layout successor until the CFG reduces\n");
    873     }
    874 
    875     // Place this block, updating the datastructures to reflect its placement.
    876     BlockChain &SuccChain = *BlockToChain[BestSucc];
    877     // Zero out UnscheduledPredecessors for the successor we're about to merge in case
    878     // we selected a successor that didn't fit naturally into the CFG.
    879     SuccChain.UnscheduledPredecessors = 0;
    880     DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
    881                  << getBlockName(BestSucc) << "\n");
    882     markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
    883     Chain.merge(BestSucc, &SuccChain);
    884     BB = *std::prev(Chain.end());
    885   }
    886 
    887   DEBUG(dbgs() << "Finished forming chain for header block "
    888                << getBlockName(*Chain.begin()) << "\n");
    889 }
    890 
    891 /// \brief Find the best loop top block for layout.
    892 ///
    893 /// Look for a block which is strictly better than the loop header for laying
    894 /// out at the top of the loop. This looks for one and only one pattern:
    895 /// a latch block with no conditional exit. This block will cause a conditional
    896 /// jump around it or will be the bottom of the loop if we lay it out in place,
    897 /// but if it it doesn't end up at the bottom of the loop for any reason,
    898 /// rotation alone won't fix it. Because such a block will always result in an
    899 /// unconditional jump (for the backedge) rotating it in front of the loop
    900 /// header is always profitable.
    901 MachineBasicBlock *
    902 MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
    903                                        const BlockFilterSet &LoopBlockSet) {
    904   // Check that the header hasn't been fused with a preheader block due to
    905   // crazy branches. If it has, we need to start with the header at the top to
    906   // prevent pulling the preheader into the loop body.
    907   BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
    908   if (!LoopBlockSet.count(*HeaderChain.begin()))
    909     return L.getHeader();
    910 
    911   DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
    912                << "\n");
    913 
    914   BlockFrequency BestPredFreq;
    915   MachineBasicBlock *BestPred = nullptr;
    916   for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
    917     if (!LoopBlockSet.count(Pred))
    918       continue;
    919     DEBUG(dbgs() << "    header pred: " << getBlockName(Pred) << ", "
    920                  << Pred->succ_size() << " successors, ";
    921           MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
    922     if (Pred->succ_size() > 1)
    923       continue;
    924 
    925     BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
    926     if (!BestPred || PredFreq > BestPredFreq ||
    927         (!(PredFreq < BestPredFreq) &&
    928          Pred->isLayoutSuccessor(L.getHeader()))) {
    929       BestPred = Pred;
    930       BestPredFreq = PredFreq;
    931     }
    932   }
    933 
    934   // If no direct predecessor is fine, just use the loop header.
    935   if (!BestPred) {
    936     DEBUG(dbgs() << "    final top unchanged\n");
    937     return L.getHeader();
    938   }
    939 
    940   // Walk backwards through any straight line of predecessors.
    941   while (BestPred->pred_size() == 1 &&
    942          (*BestPred->pred_begin())->succ_size() == 1 &&
    943          *BestPred->pred_begin() != L.getHeader())
    944     BestPred = *BestPred->pred_begin();
    945 
    946   DEBUG(dbgs() << "    final top: " << getBlockName(BestPred) << "\n");
    947   return BestPred;
    948 }
    949 
    950 /// \brief Find the best loop exiting block for layout.
    951 ///
    952 /// This routine implements the logic to analyze the loop looking for the best
    953 /// block to layout at the top of the loop. Typically this is done to maximize
    954 /// fallthrough opportunities.
    955 MachineBasicBlock *
    956 MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
    957                                         const BlockFilterSet &LoopBlockSet) {
    958   // We don't want to layout the loop linearly in all cases. If the loop header
    959   // is just a normal basic block in the loop, we want to look for what block
    960   // within the loop is the best one to layout at the top. However, if the loop
    961   // header has be pre-merged into a chain due to predecessors not having
    962   // analyzable branches, *and* the predecessor it is merged with is *not* part
    963   // of the loop, rotating the header into the middle of the loop will create
    964   // a non-contiguous range of blocks which is Very Bad. So start with the
    965   // header and only rotate if safe.
    966   BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
    967   if (!LoopBlockSet.count(*HeaderChain.begin()))
    968     return nullptr;
    969 
    970   BlockFrequency BestExitEdgeFreq;
    971   unsigned BestExitLoopDepth = 0;
    972   MachineBasicBlock *ExitingBB = nullptr;
    973   // If there are exits to outer loops, loop rotation can severely limit
    974   // fallthrough opportunites unless it selects such an exit. Keep a set of
    975   // blocks where rotating to exit with that block will reach an outer loop.
    976   SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
    977 
    978   DEBUG(dbgs() << "Finding best loop exit for: " << getBlockName(L.getHeader())
    979                << "\n");
    980   for (MachineBasicBlock *MBB : L.getBlocks()) {
    981     BlockChain &Chain = *BlockToChain[MBB];
    982     // Ensure that this block is at the end of a chain; otherwise it could be
    983     // mid-way through an inner loop or a successor of an unanalyzable branch.
    984     if (MBB != *std::prev(Chain.end()))
    985       continue;
    986 
    987     // Now walk the successors. We need to establish whether this has a viable
    988     // exiting successor and whether it has a viable non-exiting successor.
    989     // We store the old exiting state and restore it if a viable looping
    990     // successor isn't found.
    991     MachineBasicBlock *OldExitingBB = ExitingBB;
    992     BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
    993     bool HasLoopingSucc = false;
    994     for (MachineBasicBlock *Succ : MBB->successors()) {
    995       if (Succ->isEHPad())
    996         continue;
    997       if (Succ == MBB)
    998         continue;
    999       BlockChain &SuccChain = *BlockToChain[Succ];
   1000       // Don't split chains, either this chain or the successor's chain.
   1001       if (&Chain == &SuccChain) {
   1002         DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
   1003                      << getBlockName(Succ) << " (chain conflict)\n");
   1004         continue;
   1005       }
   1006 
   1007       auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
   1008       if (LoopBlockSet.count(Succ)) {
   1009         DEBUG(dbgs() << "    looping: " << getBlockName(MBB) << " -> "
   1010                      << getBlockName(Succ) << " (" << SuccProb << ")\n");
   1011         HasLoopingSucc = true;
   1012         continue;
   1013       }
   1014 
   1015       unsigned SuccLoopDepth = 0;
   1016       if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
   1017         SuccLoopDepth = ExitLoop->getLoopDepth();
   1018         if (ExitLoop->contains(&L))
   1019           BlocksExitingToOuterLoop.insert(MBB);
   1020       }
   1021 
   1022       BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
   1023       DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
   1024                    << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (";
   1025             MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
   1026       // Note that we bias this toward an existing layout successor to retain
   1027       // incoming order in the absence of better information. The exit must have
   1028       // a frequency higher than the current exit before we consider breaking
   1029       // the layout.
   1030       BranchProbability Bias(100 - ExitBlockBias, 100);
   1031       if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
   1032           ExitEdgeFreq > BestExitEdgeFreq ||
   1033           (MBB->isLayoutSuccessor(Succ) &&
   1034            !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
   1035         BestExitEdgeFreq = ExitEdgeFreq;
   1036         ExitingBB = MBB;
   1037       }
   1038     }
   1039 
   1040     if (!HasLoopingSucc) {
   1041       // Restore the old exiting state, no viable looping successor was found.
   1042       ExitingBB = OldExitingBB;
   1043       BestExitEdgeFreq = OldBestExitEdgeFreq;
   1044     }
   1045   }
   1046   // Without a candidate exiting block or with only a single block in the
   1047   // loop, just use the loop header to layout the loop.
   1048   if (!ExitingBB || L.getNumBlocks() == 1)
   1049     return nullptr;
   1050 
   1051   // Also, if we have exit blocks which lead to outer loops but didn't select
   1052   // one of them as the exiting block we are rotating toward, disable loop
   1053   // rotation altogether.
   1054   if (!BlocksExitingToOuterLoop.empty() &&
   1055       !BlocksExitingToOuterLoop.count(ExitingBB))
   1056     return nullptr;
   1057 
   1058   DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB) << "\n");
   1059   return ExitingBB;
   1060 }
   1061 
   1062 /// \brief Attempt to rotate an exiting block to the bottom of the loop.
   1063 ///
   1064 /// Once we have built a chain, try to rotate it to line up the hot exit block
   1065 /// with fallthrough out of the loop if doing so doesn't introduce unnecessary
   1066 /// branches. For example, if the loop has fallthrough into its header and out
   1067 /// of its bottom already, don't rotate it.
   1068 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
   1069                                        MachineBasicBlock *ExitingBB,
   1070                                        const BlockFilterSet &LoopBlockSet) {
   1071   if (!ExitingBB)
   1072     return;
   1073 
   1074   MachineBasicBlock *Top = *LoopChain.begin();
   1075   bool ViableTopFallthrough = false;
   1076   for (MachineBasicBlock *Pred : Top->predecessors()) {
   1077     BlockChain *PredChain = BlockToChain[Pred];
   1078     if (!LoopBlockSet.count(Pred) &&
   1079         (!PredChain || Pred == *std::prev(PredChain->end()))) {
   1080       ViableTopFallthrough = true;
   1081       break;
   1082     }
   1083   }
   1084 
   1085   // If the header has viable fallthrough, check whether the current loop
   1086   // bottom is a viable exiting block. If so, bail out as rotating will
   1087   // introduce an unnecessary branch.
   1088   if (ViableTopFallthrough) {
   1089     MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
   1090     for (MachineBasicBlock *Succ : Bottom->successors()) {
   1091       BlockChain *SuccChain = BlockToChain[Succ];
   1092       if (!LoopBlockSet.count(Succ) &&
   1093           (!SuccChain || Succ == *SuccChain->begin()))
   1094         return;
   1095     }
   1096   }
   1097 
   1098   BlockChain::iterator ExitIt =
   1099       std::find(LoopChain.begin(), LoopChain.end(), ExitingBB);
   1100   if (ExitIt == LoopChain.end())
   1101     return;
   1102 
   1103   std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
   1104 }
   1105 
   1106 /// \brief Attempt to rotate a loop based on profile data to reduce branch cost.
   1107 ///
   1108 /// With profile data, we can determine the cost in terms of missed fall through
   1109 /// opportunities when rotating a loop chain and select the best rotation.
   1110 /// Basically, there are three kinds of cost to consider for each rotation:
   1111 ///    1. The possibly missed fall through edge (if it exists) from BB out of
   1112 ///    the loop to the loop header.
   1113 ///    2. The possibly missed fall through edges (if they exist) from the loop
   1114 ///    exits to BB out of the loop.
   1115 ///    3. The missed fall through edge (if it exists) from the last BB to the
   1116 ///    first BB in the loop chain.
   1117 ///  Therefore, the cost for a given rotation is the sum of costs listed above.
   1118 ///  We select the best rotation with the smallest cost.
   1119 void MachineBlockPlacement::rotateLoopWithProfile(
   1120     BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) {
   1121   auto HeaderBB = L.getHeader();
   1122   auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB);
   1123   auto RotationPos = LoopChain.end();
   1124 
   1125   BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
   1126 
   1127   // A utility lambda that scales up a block frequency by dividing it by a
   1128   // branch probability which is the reciprocal of the scale.
   1129   auto ScaleBlockFrequency = [](BlockFrequency Freq,
   1130                                 unsigned Scale) -> BlockFrequency {
   1131     if (Scale == 0)
   1132       return 0;
   1133     // Use operator / between BlockFrequency and BranchProbability to implement
   1134     // saturating multiplication.
   1135     return Freq / BranchProbability(1, Scale);
   1136   };
   1137 
   1138   // Compute the cost of the missed fall-through edge to the loop header if the
   1139   // chain head is not the loop header. As we only consider natural loops with
   1140   // single header, this computation can be done only once.
   1141   BlockFrequency HeaderFallThroughCost(0);
   1142   for (auto *Pred : HeaderBB->predecessors()) {
   1143     BlockChain *PredChain = BlockToChain[Pred];
   1144     if (!LoopBlockSet.count(Pred) &&
   1145         (!PredChain || Pred == *std::prev(PredChain->end()))) {
   1146       auto EdgeFreq =
   1147           MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
   1148       auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
   1149       // If the predecessor has only an unconditional jump to the header, we
   1150       // need to consider the cost of this jump.
   1151       if (Pred->succ_size() == 1)
   1152         FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
   1153       HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
   1154     }
   1155   }
   1156 
   1157   // Here we collect all exit blocks in the loop, and for each exit we find out
   1158   // its hottest exit edge. For each loop rotation, we define the loop exit cost
   1159   // as the sum of frequencies of exit edges we collect here, excluding the exit
   1160   // edge from the tail of the loop chain.
   1161   SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
   1162   for (auto BB : LoopChain) {
   1163     auto LargestExitEdgeProb = BranchProbability::getZero();
   1164     for (auto *Succ : BB->successors()) {
   1165       BlockChain *SuccChain = BlockToChain[Succ];
   1166       if (!LoopBlockSet.count(Succ) &&
   1167           (!SuccChain || Succ == *SuccChain->begin())) {
   1168         auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
   1169         LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
   1170       }
   1171     }
   1172     if (LargestExitEdgeProb > BranchProbability::getZero()) {
   1173       auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
   1174       ExitsWithFreq.emplace_back(BB, ExitFreq);
   1175     }
   1176   }
   1177 
   1178   // In this loop we iterate every block in the loop chain and calculate the
   1179   // cost assuming the block is the head of the loop chain. When the loop ends,
   1180   // we should have found the best candidate as the loop chain's head.
   1181   for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
   1182             EndIter = LoopChain.end();
   1183        Iter != EndIter; Iter++, TailIter++) {
   1184     // TailIter is used to track the tail of the loop chain if the block we are
   1185     // checking (pointed by Iter) is the head of the chain.
   1186     if (TailIter == LoopChain.end())
   1187       TailIter = LoopChain.begin();
   1188 
   1189     auto TailBB = *TailIter;
   1190 
   1191     // Calculate the cost by putting this BB to the top.
   1192     BlockFrequency Cost = 0;
   1193 
   1194     // If the current BB is the loop header, we need to take into account the
   1195     // cost of the missed fall through edge from outside of the loop to the
   1196     // header.
   1197     if (Iter != HeaderIter)
   1198       Cost += HeaderFallThroughCost;
   1199 
   1200     // Collect the loop exit cost by summing up frequencies of all exit edges
   1201     // except the one from the chain tail.
   1202     for (auto &ExitWithFreq : ExitsWithFreq)
   1203       if (TailBB != ExitWithFreq.first)
   1204         Cost += ExitWithFreq.second;
   1205 
   1206     // The cost of breaking the once fall-through edge from the tail to the top
   1207     // of the loop chain. Here we need to consider three cases:
   1208     // 1. If the tail node has only one successor, then we will get an
   1209     //    additional jmp instruction. So the cost here is (MisfetchCost +
   1210     //    JumpInstCost) * tail node frequency.
   1211     // 2. If the tail node has two successors, then we may still get an
   1212     //    additional jmp instruction if the layout successor after the loop
   1213     //    chain is not its CFG successor. Note that the more frequently executed
   1214     //    jmp instruction will be put ahead of the other one. Assume the
   1215     //    frequency of those two branches are x and y, where x is the frequency
   1216     //    of the edge to the chain head, then the cost will be
   1217     //    (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency.
   1218     // 3. If the tail node has more than two successors (this rarely happens),
   1219     //    we won't consider any additional cost.
   1220     if (TailBB->isSuccessor(*Iter)) {
   1221       auto TailBBFreq = MBFI->getBlockFreq(TailBB);
   1222       if (TailBB->succ_size() == 1)
   1223         Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
   1224                                     MisfetchCost + JumpInstCost);
   1225       else if (TailBB->succ_size() == 2) {
   1226         auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
   1227         auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
   1228         auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
   1229                                   ? TailBBFreq * TailToHeadProb.getCompl()
   1230                                   : TailToHeadFreq;
   1231         Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
   1232                 ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
   1233       }
   1234     }
   1235 
   1236     DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockName(*Iter)
   1237                  << " to the top: " << Cost.getFrequency() << "\n");
   1238 
   1239     if (Cost < SmallestRotationCost) {
   1240       SmallestRotationCost = Cost;
   1241       RotationPos = Iter;
   1242     }
   1243   }
   1244 
   1245   if (RotationPos != LoopChain.end()) {
   1246     DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
   1247                  << " to the top\n");
   1248     std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
   1249   }
   1250 }
   1251 
   1252 /// \brief Collect blocks in the given loop that are to be placed.
   1253 ///
   1254 /// When profile data is available, exclude cold blocks from the returned set;
   1255 /// otherwise, collect all blocks in the loop.
   1256 MachineBlockPlacement::BlockFilterSet
   1257 MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
   1258   BlockFilterSet LoopBlockSet;
   1259 
   1260   // Filter cold blocks off from LoopBlockSet when profile data is available.
   1261   // Collect the sum of frequencies of incoming edges to the loop header from
   1262   // outside. If we treat the loop as a super block, this is the frequency of
   1263   // the loop. Then for each block in the loop, we calculate the ratio between
   1264   // its frequency and the frequency of the loop block. When it is too small,
   1265   // don't add it to the loop chain. If there are outer loops, then this block
   1266   // will be merged into the first outer loop chain for which this block is not
   1267   // cold anymore. This needs precise profile data and we only do this when
   1268   // profile data is available.
   1269   if (F->getFunction()->getEntryCount()) {
   1270     BlockFrequency LoopFreq(0);
   1271     for (auto LoopPred : L.getHeader()->predecessors())
   1272       if (!L.contains(LoopPred))
   1273         LoopFreq += MBFI->getBlockFreq(LoopPred) *
   1274                     MBPI->getEdgeProbability(LoopPred, L.getHeader());
   1275 
   1276     for (MachineBasicBlock *LoopBB : L.getBlocks()) {
   1277       auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
   1278       if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
   1279         continue;
   1280       LoopBlockSet.insert(LoopBB);
   1281     }
   1282   } else
   1283     LoopBlockSet.insert(L.block_begin(), L.block_end());
   1284 
   1285   return LoopBlockSet;
   1286 }
   1287 
   1288 /// \brief Forms basic block chains from the natural loop structures.
   1289 ///
   1290 /// These chains are designed to preserve the existing *structure* of the code
   1291 /// as much as possible. We can then stitch the chains together in a way which
   1292 /// both preserves the topological structure and minimizes taken conditional
   1293 /// branches.
   1294 void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
   1295   // First recurse through any nested loops, building chains for those inner
   1296   // loops.
   1297   for (MachineLoop *InnerLoop : L)
   1298     buildLoopChains(*InnerLoop);
   1299 
   1300   assert(BlockWorkList.empty());
   1301   assert(EHPadWorkList.empty());
   1302   BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
   1303 
   1304   // Check if we have profile data for this function. If yes, we will rotate
   1305   // this loop by modeling costs more precisely which requires the profile data
   1306   // for better layout.
   1307   bool RotateLoopWithProfile =
   1308       ForcePreciseRotationCost ||
   1309       (PreciseRotationCost && F->getFunction()->getEntryCount());
   1310 
   1311   // First check to see if there is an obviously preferable top block for the
   1312   // loop. This will default to the header, but may end up as one of the
   1313   // predecessors to the header if there is one which will result in strictly
   1314   // fewer branches in the loop body.
   1315   // When we use profile data to rotate the loop, this is unnecessary.
   1316   MachineBasicBlock *LoopTop =
   1317       RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
   1318 
   1319   // If we selected just the header for the loop top, look for a potentially
   1320   // profitable exit block in the event that rotating the loop can eliminate
   1321   // branches by placing an exit edge at the bottom.
   1322   MachineBasicBlock *ExitingBB = nullptr;
   1323   if (!RotateLoopWithProfile && LoopTop == L.getHeader())
   1324     ExitingBB = findBestLoopExit(L, LoopBlockSet);
   1325 
   1326   BlockChain &LoopChain = *BlockToChain[LoopTop];
   1327 
   1328   // FIXME: This is a really lame way of walking the chains in the loop: we
   1329   // walk the blocks, and use a set to prevent visiting a particular chain
   1330   // twice.
   1331   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
   1332   assert(LoopChain.UnscheduledPredecessors == 0);
   1333   UpdatedPreds.insert(&LoopChain);
   1334 
   1335   for (MachineBasicBlock *LoopBB : LoopBlockSet)
   1336     fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
   1337 
   1338   buildChain(LoopTop, LoopChain, &LoopBlockSet);
   1339 
   1340   if (RotateLoopWithProfile)
   1341     rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
   1342   else
   1343     rotateLoop(LoopChain, ExitingBB, LoopBlockSet);
   1344 
   1345   DEBUG({
   1346     // Crash at the end so we get all of the debugging output first.
   1347     bool BadLoop = false;
   1348     if (LoopChain.UnscheduledPredecessors) {
   1349       BadLoop = true;
   1350       dbgs() << "Loop chain contains a block without its preds placed!\n"
   1351              << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
   1352              << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
   1353     }
   1354     for (MachineBasicBlock *ChainBB : LoopChain) {
   1355       dbgs() << "          ... " << getBlockName(ChainBB) << "\n";
   1356       if (!LoopBlockSet.erase(ChainBB)) {
   1357         // We don't mark the loop as bad here because there are real situations
   1358         // where this can occur. For example, with an unanalyzable fallthrough
   1359         // from a loop block to a non-loop block or vice versa.
   1360         dbgs() << "Loop chain contains a block not contained by the loop!\n"
   1361                << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
   1362                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
   1363                << "  Bad block:    " << getBlockName(ChainBB) << "\n";
   1364       }
   1365     }
   1366 
   1367     if (!LoopBlockSet.empty()) {
   1368       BadLoop = true;
   1369       for (MachineBasicBlock *LoopBB : LoopBlockSet)
   1370         dbgs() << "Loop contains blocks never placed into a chain!\n"
   1371                << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
   1372                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
   1373                << "  Bad block:    " << getBlockName(LoopBB) << "\n";
   1374     }
   1375     assert(!BadLoop && "Detected problems with the placement of this loop.");
   1376   });
   1377 
   1378   BlockWorkList.clear();
   1379   EHPadWorkList.clear();
   1380 }
   1381 
   1382 /// When OutlineOpitonalBranches is on, this method colects BBs that
   1383 /// dominates all terminator blocks of the function \p F.
   1384 void MachineBlockPlacement::collectMustExecuteBBs() {
   1385   if (OutlineOptionalBranches) {
   1386     // Find the nearest common dominator of all of F's terminators.
   1387     MachineBasicBlock *Terminator = nullptr;
   1388     for (MachineBasicBlock &MBB : *F) {
   1389       if (MBB.succ_size() == 0) {
   1390         if (Terminator == nullptr)
   1391           Terminator = &MBB;
   1392         else
   1393           Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
   1394       }
   1395     }
   1396 
   1397     // MBBs dominating this common dominator are unavoidable.
   1398     UnavoidableBlocks.clear();
   1399     for (MachineBasicBlock &MBB : *F) {
   1400       if (MDT->dominates(&MBB, Terminator)) {
   1401         UnavoidableBlocks.insert(&MBB);
   1402       }
   1403     }
   1404   }
   1405 }
   1406 
   1407 void MachineBlockPlacement::buildCFGChains() {
   1408   // Ensure that every BB in the function has an associated chain to simplify
   1409   // the assumptions of the remaining algorithm.
   1410   SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
   1411   for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
   1412        ++FI) {
   1413     MachineBasicBlock *BB = &*FI;
   1414     BlockChain *Chain =
   1415         new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
   1416     // Also, merge any blocks which we cannot reason about and must preserve
   1417     // the exact fallthrough behavior for.
   1418     for (;;) {
   1419       Cond.clear();
   1420       MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
   1421       if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
   1422         break;
   1423 
   1424       MachineFunction::iterator NextFI = std::next(FI);
   1425       MachineBasicBlock *NextBB = &*NextFI;
   1426       // Ensure that the layout successor is a viable block, as we know that
   1427       // fallthrough is a possibility.
   1428       assert(NextFI != FE && "Can't fallthrough past the last block.");
   1429       DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
   1430                    << getBlockName(BB) << " -> " << getBlockName(NextBB)
   1431                    << "\n");
   1432       Chain->merge(NextBB, nullptr);
   1433       FI = NextFI;
   1434       BB = NextBB;
   1435     }
   1436   }
   1437 
   1438   // Turned on with OutlineOptionalBranches option
   1439   collectMustExecuteBBs();
   1440 
   1441   // Build any loop-based chains.
   1442   for (MachineLoop *L : *MLI)
   1443     buildLoopChains(*L);
   1444 
   1445   assert(BlockWorkList.empty());
   1446   assert(EHPadWorkList.empty());
   1447 
   1448   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
   1449   for (MachineBasicBlock &MBB : *F)
   1450     fillWorkLists(&MBB, UpdatedPreds);
   1451 
   1452   BlockChain &FunctionChain = *BlockToChain[&F->front()];
   1453   buildChain(&F->front(), FunctionChain);
   1454 
   1455 #ifndef NDEBUG
   1456   typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType;
   1457 #endif
   1458   DEBUG({
   1459     // Crash at the end so we get all of the debugging output first.
   1460     bool BadFunc = false;
   1461     FunctionBlockSetType FunctionBlockSet;
   1462     for (MachineBasicBlock &MBB : *F)
   1463       FunctionBlockSet.insert(&MBB);
   1464 
   1465     for (MachineBasicBlock *ChainBB : FunctionChain)
   1466       if (!FunctionBlockSet.erase(ChainBB)) {
   1467         BadFunc = true;
   1468         dbgs() << "Function chain contains a block not in the function!\n"
   1469                << "  Bad block:    " << getBlockName(ChainBB) << "\n";
   1470       }
   1471 
   1472     if (!FunctionBlockSet.empty()) {
   1473       BadFunc = true;
   1474       for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
   1475         dbgs() << "Function contains blocks never placed into a chain!\n"
   1476                << "  Bad block:    " << getBlockName(RemainingBB) << "\n";
   1477     }
   1478     assert(!BadFunc && "Detected problems with the block placement.");
   1479   });
   1480 
   1481   // Splice the blocks into place.
   1482   MachineFunction::iterator InsertPos = F->begin();
   1483   DEBUG(dbgs() << "[MBP] Function: "<< F->getName() << "\n");
   1484   for (MachineBasicBlock *ChainBB : FunctionChain) {
   1485     DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
   1486                                                        : "          ... ")
   1487                  << getBlockName(ChainBB) << "\n");
   1488     if (InsertPos != MachineFunction::iterator(ChainBB))
   1489       F->splice(InsertPos, ChainBB);
   1490     else
   1491       ++InsertPos;
   1492 
   1493     // Update the terminator of the previous block.
   1494     if (ChainBB == *FunctionChain.begin())
   1495       continue;
   1496     MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
   1497 
   1498     // FIXME: It would be awesome of updateTerminator would just return rather
   1499     // than assert when the branch cannot be analyzed in order to remove this
   1500     // boiler plate.
   1501     Cond.clear();
   1502     MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
   1503 
   1504     // The "PrevBB" is not yet updated to reflect current code layout, so,
   1505     //   o. it may fall-through to a block without explict "goto" instruction
   1506     //      before layout, and no longer fall-through it after layout; or
   1507     //   o. just opposite.
   1508     //
   1509     // analyzeBranch() may return erroneous value for FBB when these two
   1510     // situations take place. For the first scenario FBB is mistakenly set NULL;
   1511     // for the 2nd scenario, the FBB, which is expected to be NULL, is
   1512     // mistakenly pointing to "*BI".
   1513     // Thus, if the future change needs to use FBB before the layout is set, it
   1514     // has to correct FBB first by using the code similar to the following:
   1515     //
   1516     // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
   1517     //   PrevBB->updateTerminator();
   1518     //   Cond.clear();
   1519     //   TBB = FBB = nullptr;
   1520     //   if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
   1521     //     // FIXME: This should never take place.
   1522     //     TBB = FBB = nullptr;
   1523     //   }
   1524     // }
   1525     if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
   1526       PrevBB->updateTerminator();
   1527   }
   1528 
   1529   // Fixup the last block.
   1530   Cond.clear();
   1531   MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
   1532   if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
   1533     F->back().updateTerminator();
   1534 
   1535   BlockWorkList.clear();
   1536   EHPadWorkList.clear();
   1537 }
   1538 
   1539 void MachineBlockPlacement::optimizeBranches() {
   1540   BlockChain &FunctionChain = *BlockToChain[&F->front()];
   1541   SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
   1542 
   1543   // Now that all the basic blocks in the chain have the proper layout,
   1544   // make a final call to AnalyzeBranch with AllowModify set.
   1545   // Indeed, the target may be able to optimize the branches in a way we
   1546   // cannot because all branches may not be analyzable.
   1547   // E.g., the target may be able to remove an unconditional branch to
   1548   // a fallthrough when it occurs after predicated terminators.
   1549   for (MachineBasicBlock *ChainBB : FunctionChain) {
   1550     Cond.clear();
   1551     MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
   1552     if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
   1553       // If PrevBB has a two-way branch, try to re-order the branches
   1554       // such that we branch to the successor with higher probability first.
   1555       if (TBB && !Cond.empty() && FBB &&
   1556           MBPI->getEdgeProbability(ChainBB, FBB) >
   1557               MBPI->getEdgeProbability(ChainBB, TBB) &&
   1558           !TII->ReverseBranchCondition(Cond)) {
   1559         DEBUG(dbgs() << "Reverse order of the two branches: "
   1560                      << getBlockName(ChainBB) << "\n");
   1561         DEBUG(dbgs() << "    Edge probability: "
   1562                      << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
   1563                      << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
   1564         DebugLoc dl; // FIXME: this is nowhere
   1565         TII->RemoveBranch(*ChainBB);
   1566         TII->InsertBranch(*ChainBB, FBB, TBB, Cond, dl);
   1567         ChainBB->updateTerminator();
   1568       }
   1569     }
   1570   }
   1571 }
   1572 
   1573 void MachineBlockPlacement::alignBlocks() {
   1574   // Walk through the backedges of the function now that we have fully laid out
   1575   // the basic blocks and align the destination of each backedge. We don't rely
   1576   // exclusively on the loop info here so that we can align backedges in
   1577   // unnatural CFGs and backedges that were introduced purely because of the
   1578   // loop rotations done during this layout pass.
   1579   if (F->getFunction()->optForSize())
   1580     return;
   1581   BlockChain &FunctionChain = *BlockToChain[&F->front()];
   1582   if (FunctionChain.begin() == FunctionChain.end())
   1583     return; // Empty chain.
   1584 
   1585   const BranchProbability ColdProb(1, 5); // 20%
   1586   BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
   1587   BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
   1588   for (MachineBasicBlock *ChainBB : FunctionChain) {
   1589     if (ChainBB == *FunctionChain.begin())
   1590       continue;
   1591 
   1592     // Don't align non-looping basic blocks. These are unlikely to execute
   1593     // enough times to matter in practice. Note that we'll still handle
   1594     // unnatural CFGs inside of a natural outer loop (the common case) and
   1595     // rotated loops.
   1596     MachineLoop *L = MLI->getLoopFor(ChainBB);
   1597     if (!L)
   1598       continue;
   1599 
   1600     unsigned Align = TLI->getPrefLoopAlignment(L);
   1601     if (!Align)
   1602       continue; // Don't care about loop alignment.
   1603 
   1604     // If the block is cold relative to the function entry don't waste space
   1605     // aligning it.
   1606     BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
   1607     if (Freq < WeightedEntryFreq)
   1608       continue;
   1609 
   1610     // If the block is cold relative to its loop header, don't align it
   1611     // regardless of what edges into the block exist.
   1612     MachineBasicBlock *LoopHeader = L->getHeader();
   1613     BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
   1614     if (Freq < (LoopHeaderFreq * ColdProb))
   1615       continue;
   1616 
   1617     // Check for the existence of a non-layout predecessor which would benefit
   1618     // from aligning this block.
   1619     MachineBasicBlock *LayoutPred =
   1620         &*std::prev(MachineFunction::iterator(ChainBB));
   1621 
   1622     // Force alignment if all the predecessors are jumps. We already checked
   1623     // that the block isn't cold above.
   1624     if (!LayoutPred->isSuccessor(ChainBB)) {
   1625       ChainBB->setAlignment(Align);
   1626       continue;
   1627     }
   1628 
   1629     // Align this block if the layout predecessor's edge into this block is
   1630     // cold relative to the block. When this is true, other predecessors make up
   1631     // all of the hot entries into the block and thus alignment is likely to be
   1632     // important.
   1633     BranchProbability LayoutProb =
   1634         MBPI->getEdgeProbability(LayoutPred, ChainBB);
   1635     BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
   1636     if (LayoutEdgeFreq <= (Freq * ColdProb))
   1637       ChainBB->setAlignment(Align);
   1638   }
   1639 }
   1640 
   1641 bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   1642   if (skipFunction(*MF.getFunction()))
   1643     return false;
   1644 
   1645   // Check for single-block functions and skip them.
   1646   if (std::next(MF.begin()) == MF.end())
   1647     return false;
   1648 
   1649   F = &MF;
   1650   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   1651   MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
   1652       getAnalysis<MachineBlockFrequencyInfo>());
   1653   MLI = &getAnalysis<MachineLoopInfo>();
   1654   TII = MF.getSubtarget().getInstrInfo();
   1655   TLI = MF.getSubtarget().getTargetLowering();
   1656   MDT = &getAnalysis<MachineDominatorTree>();
   1657   assert(BlockToChain.empty());
   1658 
   1659   buildCFGChains();
   1660 
   1661   // Changing the layout can create new tail merging opportunities.
   1662   TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
   1663   // TailMerge can create jump into if branches that make CFG irreducible for
   1664   // HW that requires structurized CFG.
   1665   bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
   1666                          PassConfig->getEnableTailMerge() &&
   1667                          BranchFoldPlacement;
   1668   // No tail merging opportunities if the block number is less than four.
   1669   if (MF.size() > 3 && EnableTailMerge) {
   1670     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
   1671                     *MBPI);
   1672 
   1673     if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
   1674                             getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
   1675                             /*AfterBlockPlacement=*/true)) {
   1676       // Redo the layout if tail merging creates/removes/moves blocks.
   1677       BlockToChain.clear();
   1678       ChainAllocator.DestroyAll();
   1679       buildCFGChains();
   1680     }
   1681   }
   1682 
   1683   optimizeBranches();
   1684   alignBlocks();
   1685 
   1686   BlockToChain.clear();
   1687   ChainAllocator.DestroyAll();
   1688 
   1689   if (AlignAllBlock)
   1690     // Align all of the blocks in the function to a specific alignment.
   1691     for (MachineBasicBlock &MBB : MF)
   1692       MBB.setAlignment(AlignAllBlock);
   1693   else if (AlignAllNonFallThruBlocks) {
   1694     // Align all of the blocks that have no fall-through predecessors to a
   1695     // specific alignment.
   1696     for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
   1697       auto LayoutPred = std::prev(MBI);
   1698       if (!LayoutPred->isSuccessor(&*MBI))
   1699         MBI->setAlignment(AlignAllNonFallThruBlocks);
   1700     }
   1701   }
   1702 
   1703   // We always return true as we have no way to track whether the final order
   1704   // differs from the original order.
   1705   return true;
   1706 }
   1707 
   1708 namespace {
   1709 /// \brief A pass to compute block placement statistics.
   1710 ///
   1711 /// A separate pass to compute interesting statistics for evaluating block
   1712 /// placement. This is separate from the actual placement pass so that they can
   1713 /// be computed in the absence of any placement transformations or when using
   1714 /// alternative placement strategies.
   1715 class MachineBlockPlacementStats : public MachineFunctionPass {
   1716   /// \brief A handle to the branch probability pass.
   1717   const MachineBranchProbabilityInfo *MBPI;
   1718 
   1719   /// \brief A handle to the function-wide block frequency pass.
   1720   const MachineBlockFrequencyInfo *MBFI;
   1721 
   1722 public:
   1723   static char ID; // Pass identification, replacement for typeid
   1724   MachineBlockPlacementStats() : MachineFunctionPass(ID) {
   1725     initializeMachineBlockPlacementStatsPass(*PassRegistry::getPassRegistry());
   1726   }
   1727 
   1728   bool runOnMachineFunction(MachineFunction &F) override;
   1729 
   1730   void getAnalysisUsage(AnalysisUsage &AU) const override {
   1731     AU.addRequired<MachineBranchProbabilityInfo>();
   1732     AU.addRequired<MachineBlockFrequencyInfo>();
   1733     AU.setPreservesAll();
   1734     MachineFunctionPass::getAnalysisUsage(AU);
   1735   }
   1736 };
   1737 }
   1738 
   1739 char MachineBlockPlacementStats::ID = 0;
   1740 char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
   1741 INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
   1742                       "Basic Block Placement Stats", false, false)
   1743 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
   1744 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
   1745 INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
   1746                     "Basic Block Placement Stats", false, false)
   1747 
   1748 bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
   1749   // Check for single-block functions and skip them.
   1750   if (std::next(F.begin()) == F.end())
   1751     return false;
   1752 
   1753   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   1754   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   1755 
   1756   for (MachineBasicBlock &MBB : F) {
   1757     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
   1758     Statistic &NumBranches =
   1759         (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
   1760     Statistic &BranchTakenFreq =
   1761         (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
   1762     for (MachineBasicBlock *Succ : MBB.successors()) {
   1763       // Skip if this successor is a fallthrough.
   1764       if (MBB.isLayoutSuccessor(Succ))
   1765         continue;
   1766 
   1767       BlockFrequency EdgeFreq =
   1768           BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
   1769       ++NumBranches;
   1770       BranchTakenFreq += EdgeFreq.getFrequency();
   1771     }
   1772   }
   1773 
   1774   return false;
   1775 }
   1776