//===- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that are found profitable and only for
///      CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction-penalty. This assures no
///         degradation even with 25% branch misprediction.
///
/// Note: This pass is assumed to run on SSA machine code.
//
//===----------------------------------------------------------------------===//
//
//  External interfaces:
//      FunctionPass *llvm::createX86CmovConverterPass();
//      bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

namespace llvm {

void initializeX86CmovConverterPassPass(PassRegistry &);

} // end namespace llvm

// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);

namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {
    initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p Blocks and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in Blocks.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in Blocks.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfo>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // potentially hot loops, we want to quickly remove all CMOVs with a memory
  // operand. Such a CMOV risks a stall waiting for the load to complete, a
  // latency that speculative execution behind a branch is better suited to
  // hide on modern x86 chips.
  if (ForceMemOperand) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks;
    for (auto &MBB : MF)
      Blocks.push_back(&MBB);
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't contain at least one CMOV with a memory
        // operand.
        if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory
        // load, always rewrite them. On x86, a CMOV will dramatically amplify
        // any memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //           * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //           * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    for (MachineLoop *Child : Loops[i]->getSubLoops())
      Loops.push_back(Child);

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvement (TODO's):
  // --------------------------------------
  //   TODO: Add support for X86::CMOVrm instructions.
  //   TODO: Add support for X86::SETcc instructions.
  //   TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//

  // Current processed CMOV-Group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC, FirstOppCC, MemOpCC;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator for whether the currently processed CMOV-group should be
    // skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;
      X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
      // Check if we found an X86::CMOVrr instruction.
      if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range; reset flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or if it has a
        // condition code different from both FirstCC and FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-Group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS, to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range; check if
    // the currently processed CMOV-group should not be skipped and add it as
    // a CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of a CMOV instruction as if it were converted into a branch.
/// \param TrueOpDepth depth cost of CMOV true-value operand.
/// \param FalseOpDepth depth cost of CMOV false-value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  //===--------------------------------------------------------------------===//
  // With no info about branch weight, we assume 50% for each value operand.
  // Thus, depth of optimized CMOV instruction is the rounded up average of
  // its True-Operand-Value-Depth and False-Operand-Value-Depth.
  //===--------------------------------------------------------------------===//
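  // For example, TrueOpDepth = 2 and FalseOpDepth = 5 yield
  // (2 + 5 + 1) / 2 = 4, i.e., the 3.5-cycle average rounded up.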
  return (TrueOpDepth + FalseOpDepth + 1) / 2;
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations for which instruction depth is calculated.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type, maps the register to its last def instruction.
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps register operand to its def instruction, which can be nullptr if it
  /// is unknown (e.g., operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set depth of unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert(Group.begin(), Group.end());

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //     * For a CMOV instruction in the optimized loop the depth is calculated
  //       as: CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical-path in the
  // loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  //   Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the
  //   loop.
  //===--------------------------------------------------------------------===//
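  // For example, an instruction with a 2-cycle latency whose deepest input
  // operand has depth 3 gets depth 5; a CMOV in the optimized loop instead
  // bases its depth on the rounded-up average of its value operands' depths.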
  for (unsigned I = 0; I < LoopIterations; ++I) {
    DepthInfo &MaxDepth = LoopDepth[I];
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" as "uses()" also returns implicit definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          unsigned Reg = MO.getReg();
          auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          unsigned Reg = MO.getReg();
          RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};
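  // For example, original loop depths of {20, 38} cycles after one and two
  // iterations against optimized depths of {16, 30} give Diff = {4, 8}.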

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           Critical-path is iteration independent - there is no dependency
  //           of critical-path instructions on critical-path instructions of
  //           the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth 12.5% less than the original loop, per iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           Critical-path is iteration dependent - there is a dependency of
  //           critical-path instructions on critical-path instructions of the
  //           previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);
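  // E.g., in case 1 with LoopDepth[0].Depth == 40 cycles, the loop is accepted
  // only when Diff[0] * 8 >= 40, i.e., when conversion saves at least 5 cycles
  // (12.5%) per iteration.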

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate whether it is worth
  // optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction-penalty.
  //===--------------------------------------------------------------------===//
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid CMOV instructions whose value is used as a pointer to load from.
      // This is another conservative check to avoid converting CMOV
      // instructions used in tree-search-like algorithms, where the branch is
      // hard to predict.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
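      // E.g., with MispredictPenalty == 16, the group is rejected unless the
      // condition is at least 4 cycles deeper than the selected value.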
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether the EFLAGS operand is live after the MI instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS))
      return true;
    if (I->definesRegister(X86::EFLAGS))
      return false;
  }

  // We hit the end of the block; check whether EFLAGS is live into a
  // successor.
  for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
    if ((*I)->isLiveIn(X86::EFLAGS))
      return true;
  }

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in between,
/// move all debug instructions to after the last CMOV instruction, making the
/// CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions
  // between the first CMOV and the last CMOV, then pack the group and make
  // the CMOV instructions consecutive by moving the debug instructions to
  // after the last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern.  The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but that
  // is simpler because the memory operands are always on the false path, so
  // we can simply take that input, whatever it is.
  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      unsigned FalseReg =
          MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
              .getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    unsigned TmpReg = MRI->createVirtualRegister(RC);

    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(MachineBasicBlock::iterator(MI),
               std::next(MachineBasicBlock::iterator(MI)));

    // Add this CMOV's result to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the registers that went into the PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If the CMOV we are processing has the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    //  SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //  ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}
    869