      1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file implements inline cost analysis.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "llvm/Analysis/InlineCost.h"
     15 #include "llvm/ADT/STLExtras.h"
     16 #include "llvm/ADT/SetVector.h"
     17 #include "llvm/ADT/SmallPtrSet.h"
     18 #include "llvm/ADT/SmallVector.h"
     19 #include "llvm/ADT/Statistic.h"
     20 #include "llvm/Analysis/AssumptionCache.h"
     21 #include "llvm/Analysis/CodeMetrics.h"
     22 #include "llvm/Analysis/ConstantFolding.h"
     23 #include "llvm/Analysis/InstructionSimplify.h"
     24 #include "llvm/Analysis/ProfileSummaryInfo.h"
     25 #include "llvm/Analysis/TargetTransformInfo.h"
     26 #include "llvm/IR/CallSite.h"
     27 #include "llvm/IR/CallingConv.h"
     28 #include "llvm/IR/DataLayout.h"
     29 #include "llvm/IR/GetElementPtrTypeIterator.h"
     30 #include "llvm/IR/GlobalAlias.h"
     31 #include "llvm/IR/InstVisitor.h"
     32 #include "llvm/IR/IntrinsicInst.h"
     33 #include "llvm/IR/Operator.h"
     34 #include "llvm/Support/Debug.h"
     35 #include "llvm/Support/raw_ostream.h"
     36 
     37 using namespace llvm;
     38 
     39 #define DEBUG_TYPE "inline-cost"
     40 
     41 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
     42 
     43 // Threshold to use when optsize is specified (and there is no
     44 // -inline-threshold).
     45 const int OptSizeThreshold = 75;
     46 
     47 // Threshold to use when -Oz is specified (and there is no -inline-threshold).
     48 const int OptMinSizeThreshold = 25;
     49 
     50 // Threshold to use when -O[34] is specified (and there is no
     51 // -inline-threshold).
     52 const int OptAggressiveThreshold = 275;
     53 
     54 static cl::opt<int> DefaultInlineThreshold(
     55     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
     56     cl::desc("Control the amount of inlining to perform (default = 225)"));
     57 
     58 static cl::opt<int> HintThreshold(
     59     "inlinehint-threshold", cl::Hidden, cl::init(325),
     60     cl::desc("Threshold for inlining functions with inline hint"));
     61 
      62 // We introduce this threshold to help performance of instrumentation-based
      63 // PGO before we actually hook up the inliner with analysis passes such as
      64 // BPI and BFI.
     65 static cl::opt<int> ColdThreshold(
     66     "inlinecold-threshold", cl::Hidden, cl::init(225),
     67     cl::desc("Threshold for inlining functions with cold attribute"));
     68 
     69 namespace {
     70 
     71 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
     72   typedef InstVisitor<CallAnalyzer, bool> Base;
     73   friend class InstVisitor<CallAnalyzer, bool>;
     74 
     75   /// The TargetTransformInfo available for this compilation.
     76   const TargetTransformInfo &TTI;
     77 
     78   /// The cache of @llvm.assume intrinsics.
     79   AssumptionCacheTracker *ACT;
     80 
     81   /// Profile summary information.
     82   ProfileSummaryInfo *PSI;
     83 
     84   // The called function.
     85   Function &F;
     86 
     87   // The candidate callsite being analyzed. Please do not use this to do
     88   // analysis in the caller function; we want the inline cost query to be
     89   // easily cacheable. Instead, use the cover function paramHasAttr.
     90   CallSite CandidateCS;
     91 
     92   int Threshold;
     93   int Cost;
     94 
     95   bool IsCallerRecursive;
     96   bool IsRecursiveCall;
     97   bool ExposesReturnsTwice;
     98   bool HasDynamicAlloca;
     99   bool ContainsNoDuplicateCall;
    100   bool HasReturn;
    101   bool HasIndirectBr;
    102   bool HasFrameEscape;
    103 
    104   /// Number of bytes allocated statically by the callee.
    105   uint64_t AllocatedSize;
    106   unsigned NumInstructions, NumVectorInstructions;
    107   int FiftyPercentVectorBonus, TenPercentVectorBonus;
    108   int VectorBonus;
    109 
    110   // While we walk the potentially-inlined instructions, we build up and
    111   // maintain a mapping of simplified values specific to this callsite. The
    112   // idea is to propagate any special information we have about arguments to
    113   // this call through the inlinable section of the function, and account for
    114   // likely simplifications post-inlining. The most important aspect we track
    115   // is CFG altering simplifications -- when we prove a basic block dead, that
    116   // can cause dramatic shifts in the cost of inlining a function.
    117   DenseMap<Value *, Constant *> SimplifiedValues;
    118 
    119   // Keep track of the values which map back (through function arguments) to
    120   // allocas on the caller stack which could be simplified through SROA.
    121   DenseMap<Value *, Value *> SROAArgValues;
    122 
    123   // The mapping of caller Alloca values to their accumulated cost savings. If
    124   // we have to disable SROA for one of the allocas, this tells us how much
    125   // cost must be added.
    126   DenseMap<Value *, int> SROAArgCosts;
    127 
    128   // Keep track of values which map to a pointer base and constant offset.
    129   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
    130 
    131   // Custom simplification helper routines.
    132   bool isAllocaDerivedArg(Value *V);
    133   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
    134                             DenseMap<Value *, int>::iterator &CostIt);
    135   void disableSROA(DenseMap<Value *, int>::iterator CostIt);
    136   void disableSROA(Value *V);
    137   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
    138                           int InstructionCost);
    139   bool isGEPOffsetConstant(GetElementPtrInst &GEP);
    140   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
    141   bool simplifyCallSite(Function *F, CallSite CS);
    142   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
    143 
    144   /// Return true if the given argument to the function being considered for
    145   /// inlining has the given attribute set either at the call site or the
    146   /// function declaration.  Primarily used to inspect call site specific
    147   /// attributes since these can be more precise than the ones on the callee
    148   /// itself.
    149   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
    150 
    151   /// Return true if the given value is known non null within the callee if
    152   /// inlined through this particular callsite.
    153   bool isKnownNonNullInCallee(Value *V);
    154 
    155   /// Update Threshold based on callsite properties such as callee
    156   /// attributes and callee hotness for PGO builds. The Callee is explicitly
    157   /// passed to support analyzing indirect calls whose target is inferred by
    158   /// analysis.
    159   void updateThreshold(CallSite CS, Function &Callee);
    160 
    161   /// Return true if size growth is allowed when inlining the callee at CS.
    162   bool allowSizeGrowth(CallSite CS);
    163 
    164   // Custom analysis routines.
    165   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
    166 
     167   // Disable several entry points to the visitor by declaring but not defining
     168   // them here, so we don't accidentally use them.
    169   void visit(Module *);
    170   void visit(Module &);
    171   void visit(Function *);
    172   void visit(Function &);
    173   void visit(BasicBlock *);
    174   void visit(BasicBlock &);
    175 
    176   // Provide base case for our instruction visit.
    177   bool visitInstruction(Instruction &I);
    178 
    179   // Our visit overrides.
    180   bool visitAlloca(AllocaInst &I);
    181   bool visitPHI(PHINode &I);
    182   bool visitGetElementPtr(GetElementPtrInst &I);
    183   bool visitBitCast(BitCastInst &I);
    184   bool visitPtrToInt(PtrToIntInst &I);
    185   bool visitIntToPtr(IntToPtrInst &I);
    186   bool visitCastInst(CastInst &I);
    187   bool visitUnaryInstruction(UnaryInstruction &I);
    188   bool visitCmpInst(CmpInst &I);
    189   bool visitSub(BinaryOperator &I);
    190   bool visitBinaryOperator(BinaryOperator &I);
    191   bool visitLoad(LoadInst &I);
    192   bool visitStore(StoreInst &I);
    193   bool visitExtractValue(ExtractValueInst &I);
    194   bool visitInsertValue(InsertValueInst &I);
    195   bool visitCallSite(CallSite CS);
    196   bool visitReturnInst(ReturnInst &RI);
    197   bool visitBranchInst(BranchInst &BI);
    198   bool visitSwitchInst(SwitchInst &SI);
    199   bool visitIndirectBrInst(IndirectBrInst &IBI);
    200   bool visitResumeInst(ResumeInst &RI);
    201   bool visitCleanupReturnInst(CleanupReturnInst &RI);
    202   bool visitCatchReturnInst(CatchReturnInst &RI);
    203   bool visitUnreachableInst(UnreachableInst &I);
    204 
    205 public:
    206   CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
    207                ProfileSummaryInfo *PSI, Function &Callee, int Threshold,
    208                CallSite CSArg)
    209       : TTI(TTI), ACT(ACT), PSI(PSI), F(Callee), CandidateCS(CSArg),
    210         Threshold(Threshold), Cost(0), IsCallerRecursive(false),
    211         IsRecursiveCall(false), ExposesReturnsTwice(false),
    212         HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
    213         HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
    214         AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
    215         FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
    216         NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
    217         NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
    218         NumInstructionsSimplified(0), SROACostSavings(0),
    219         SROACostSavingsLost(0) {}
    220 
    221   bool analyzeCall(CallSite CS);
    222 
    223   int getThreshold() { return Threshold; }
    224   int getCost() { return Cost; }
    225 
    226   // Keep a bunch of stats about the cost savings found so we can print them
    227   // out when debugging.
    228   unsigned NumConstantArgs;
    229   unsigned NumConstantOffsetPtrArgs;
    230   unsigned NumAllocaArgs;
    231   unsigned NumConstantPtrCmps;
    232   unsigned NumConstantPtrDiffs;
    233   unsigned NumInstructionsSimplified;
    234   unsigned SROACostSavings;
    235   unsigned SROACostSavingsLost;
    236 
    237   void dump();
    238 };
    239 
    240 } // namespace
    241 
    242 /// \brief Test whether the given value is an Alloca-derived function argument.
    243 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
    244   return SROAArgValues.count(V);
    245 }
    246 
     247 /// \brief Look up the SROA-candidate argument and cost iterator which V maps to.
     248 /// Returns false if V does not map to an SROA candidate.
    249 bool CallAnalyzer::lookupSROAArgAndCost(
    250     Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
    251   if (SROAArgValues.empty() || SROAArgCosts.empty())
    252     return false;
    253 
    254   DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
    255   if (ArgIt == SROAArgValues.end())
    256     return false;
    257 
    258   Arg = ArgIt->second;
    259   CostIt = SROAArgCosts.find(Arg);
    260   return CostIt != SROAArgCosts.end();
    261 }
    262 
    263 /// \brief Disable SROA for the candidate marked by this cost iterator.
    264 ///
    265 /// This marks the candidate as no longer viable for SROA, and adds the cost
    266 /// savings associated with it back into the inline cost measurement.
    267 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
    268   // If we're no longer able to perform SROA we need to undo its cost savings
    269   // and prevent subsequent analysis.
    270   Cost += CostIt->second;
    271   SROACostSavings -= CostIt->second;
    272   SROACostSavingsLost += CostIt->second;
    273   SROAArgCosts.erase(CostIt);
    274 }
    275 
    276 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
    277 void CallAnalyzer::disableSROA(Value *V) {
    278   Value *SROAArg;
    279   DenseMap<Value *, int>::iterator CostIt;
    280   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    281     disableSROA(CostIt);
    282 }
    283 
    284 /// \brief Accumulate the given cost for a particular SROA candidate.
    285 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
    286                                       int InstructionCost) {
    287   CostIt->second += InstructionCost;
    288   SROACostSavings += InstructionCost;
    289 }
    290 
    291 /// \brief Check whether a GEP's indices are all constant.
    292 ///
    293 /// Respects any simplified values known during the analysis of this callsite.
    294 bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
    295   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    296     if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
    297       return false;
    298 
    299   return true;
    300 }
    301 
    302 /// \brief Accumulate a constant GEP offset into an APInt if possible.
    303 ///
    304 /// Returns false if unable to compute the offset for any reason. Respects any
    305 /// simplified values known during the analysis of this callsite.
    306 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
    307   const DataLayout &DL = F.getParent()->getDataLayout();
    308   unsigned IntPtrWidth = DL.getPointerSizeInBits();
    309   assert(IntPtrWidth == Offset.getBitWidth());
    310 
    311   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
    312        GTI != GTE; ++GTI) {
    313     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    314     if (!OpC)
    315       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
    316         OpC = dyn_cast<ConstantInt>(SimpleOp);
    317     if (!OpC)
    318       return false;
    319     if (OpC->isZero())
    320       continue;
    321 
    322     // Handle a struct index, which adds its field offset to the pointer.
    323     if (StructType *STy = dyn_cast<StructType>(*GTI)) {
    324       unsigned ElementIdx = OpC->getZExtValue();
    325       const StructLayout *SL = DL.getStructLayout(STy);
    326       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
    327       continue;
    328     }
    329 
    330     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    331     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
    332   }
    333   return true;
    334 }
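
         // Illustrative note (not part of the original comments), assuming a
         // 64-bit DataLayout and hypothetical IR names: for
         //   getelementptr inbounds {i32, i64}, {i32, i64}* %p, i32 0, i32 1
         // the struct case above adds the field offset of element 1 (8 bytes
         // with natural alignment), while an array-style index such as i64 3
         // over i32 elements adds 3 * 4 = 12 bytes via the sextOrTrunc and
         // multiply path.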
    335 
    336 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
    337   // Check whether inlining will turn a dynamic alloca into a static
    338   // alloca and handle that case.
    339   if (I.isArrayAllocation()) {
    340     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    341     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
    342       const DataLayout &DL = F.getParent()->getDataLayout();
    343       Type *Ty = I.getAllocatedType();
    344       AllocatedSize = SaturatingMultiplyAdd(
    345           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
    346       return Base::visitAlloca(I);
    347     }
    348   }
    349 
    350   // Accumulate the allocated size.
    351   if (I.isStaticAlloca()) {
    352     const DataLayout &DL = F.getParent()->getDataLayout();
    353     Type *Ty = I.getAllocatedType();
    354     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
    355   }
    356 
    357   // We will happily inline static alloca instructions.
    358   if (I.isStaticAlloca())
    359     return Base::visitAlloca(I);
    360 
    361   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
    362   // a variety of reasons, and so we would like to not inline them into
    363   // functions which don't currently have a dynamic alloca. This simply
    364   // disables inlining altogether in the presence of a dynamic alloca.
    365   HasDynamicAlloca = true;
    366   return false;
    367 }
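
         // Illustrative example (hypothetical IR, assuming i32 is 4 bytes):
         //   %buf = alloca [16 x i32]
         // adds 64 bytes to AllocatedSize through the static-alloca path, and
         //   %buf = alloca i32, i32 %n
         // where %n has been simplified to the constant 16 adds the same 64
         // bytes through the SaturatingMultiplyAdd path; a size that stays
         // dynamic sets HasDynamicAlloca and blocks inlining entirely.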
    368 
    369 bool CallAnalyzer::visitPHI(PHINode &I) {
    370   // FIXME: We should potentially be tracking values through phi nodes,
    371   // especially when they collapse to a single value due to deleted CFG edges
    372   // during inlining.
    373 
    374   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
     375   // though we don't want to propagate its bonuses. The idea is to disable
    376   // SROA if it *might* be used in an inappropriate manner.
    377 
    378   // Phi nodes are always zero-cost.
    379   return true;
    380 }
    381 
    382 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
    383   Value *SROAArg;
    384   DenseMap<Value *, int>::iterator CostIt;
    385   bool SROACandidate =
    386       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
    387 
    388   // Try to fold GEPs of constant-offset call site argument pointers. This
    389   // requires target data and inbounds GEPs.
    390   if (I.isInBounds()) {
    391     // Check if we have a base + offset for the pointer.
    392     Value *Ptr = I.getPointerOperand();
    393     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    394     if (BaseAndOffset.first) {
    395       // Check if the offset of this GEP is constant, and if so accumulate it
    396       // into Offset.
    397       if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
     398         // Non-constant GEPs aren't folded, and they disable SROA.
    399         if (SROACandidate)
    400           disableSROA(CostIt);
    401         return false;
    402       }
    403 
    404       // Add the result as a new mapping to Base + Offset.
    405       ConstantOffsetPtrs[&I] = BaseAndOffset;
    406 
     407       // Also handle SROA candidates here; we already know that the GEP is
    408       // all-constant indexed.
    409       if (SROACandidate)
    410         SROAArgValues[&I] = SROAArg;
    411 
    412       return true;
    413     }
    414   }
    415 
    416   if (isGEPOffsetConstant(I)) {
    417     if (SROACandidate)
    418       SROAArgValues[&I] = SROAArg;
    419 
    420     // Constant GEPs are modeled as free.
    421     return true;
    422   }
    423 
    424   // Variable GEPs will require math and will disable SROA.
    425   if (SROACandidate)
    426     disableSROA(CostIt);
    427   return false;
    428 }
    429 
    430 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
    431   // Propagate constants through bitcasts.
    432   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
    433   if (!COp)
    434     COp = SimplifiedValues.lookup(I.getOperand(0));
    435   if (COp)
    436     if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
    437       SimplifiedValues[&I] = C;
    438       return true;
    439     }
    440 
    441   // Track base/offsets through casts
    442   std::pair<Value *, APInt> BaseAndOffset =
    443       ConstantOffsetPtrs.lookup(I.getOperand(0));
    444   // Casts don't change the offset, just wrap it up.
    445   if (BaseAndOffset.first)
    446     ConstantOffsetPtrs[&I] = BaseAndOffset;
    447 
    448   // Also look for SROA candidates here.
    449   Value *SROAArg;
    450   DenseMap<Value *, int>::iterator CostIt;
    451   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    452     SROAArgValues[&I] = SROAArg;
    453 
    454   // Bitcasts are always zero cost.
    455   return true;
    456 }
    457 
    458 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
    459   // Propagate constants through ptrtoint.
    460   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
    461   if (!COp)
    462     COp = SimplifiedValues.lookup(I.getOperand(0));
    463   if (COp)
    464     if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
    465       SimplifiedValues[&I] = C;
    466       return true;
    467     }
    468 
    469   // Track base/offset pairs when converted to a plain integer provided the
    470   // integer is large enough to represent the pointer.
    471   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
    472   const DataLayout &DL = F.getParent()->getDataLayout();
    473   if (IntegerSize >= DL.getPointerSizeInBits()) {
    474     std::pair<Value *, APInt> BaseAndOffset =
    475         ConstantOffsetPtrs.lookup(I.getOperand(0));
    476     if (BaseAndOffset.first)
    477       ConstantOffsetPtrs[&I] = BaseAndOffset;
    478   }
    479 
    480   // This is really weird. Technically, ptrtoint will disable SROA. However,
    481   // unless that ptrtoint is *used* somewhere in the live basic blocks after
    482   // inlining, it will be nuked, and SROA should proceed. All of the uses which
    483   // would block SROA would also block SROA if applied directly to a pointer,
    484   // and so we can just add the integer in here. The only places where SROA is
     485   // preserved either cannot fire on an integer, or won't in and of themselves
     486   // disable SROA (ext) without some later use that we would see and disable.
    487   Value *SROAArg;
    488   DenseMap<Value *, int>::iterator CostIt;
    489   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    490     SROAArgValues[&I] = SROAArg;
    491 
    492   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
    493 }
    494 
    495 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
     496   // Propagate constants through inttoptr.
    497   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
    498   if (!COp)
    499     COp = SimplifiedValues.lookup(I.getOperand(0));
    500   if (COp)
    501     if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
    502       SimplifiedValues[&I] = C;
    503       return true;
    504     }
    505 
    506   // Track base/offset pairs when round-tripped through a pointer without
    507   // modifications provided the integer is not too large.
    508   Value *Op = I.getOperand(0);
    509   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
    510   const DataLayout &DL = F.getParent()->getDataLayout();
    511   if (IntegerSize <= DL.getPointerSizeInBits()) {
    512     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    513     if (BaseAndOffset.first)
    514       ConstantOffsetPtrs[&I] = BaseAndOffset;
    515   }
    516 
    517   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
    518   Value *SROAArg;
    519   DenseMap<Value *, int>::iterator CostIt;
    520   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    521     SROAArgValues[&I] = SROAArg;
    522 
    523   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
    524 }
    525 
    526 bool CallAnalyzer::visitCastInst(CastInst &I) {
     527   // Propagate constants through casts.
    528   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
    529   if (!COp)
    530     COp = SimplifiedValues.lookup(I.getOperand(0));
    531   if (COp)
    532     if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
    533       SimplifiedValues[&I] = C;
    534       return true;
    535     }
    536 
    537   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
    538   disableSROA(I.getOperand(0));
    539 
    540   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
    541 }
    542 
    543 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
    544   Value *Operand = I.getOperand(0);
    545   Constant *COp = dyn_cast<Constant>(Operand);
    546   if (!COp)
    547     COp = SimplifiedValues.lookup(Operand);
    548   if (COp) {
    549     const DataLayout &DL = F.getParent()->getDataLayout();
    550     if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
    551       SimplifiedValues[&I] = C;
    552       return true;
    553     }
    554   }
    555 
    556   // Disable any SROA on the argument to arbitrary unary operators.
    557   disableSROA(Operand);
    558 
    559   return false;
    560 }
    561 
    562 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
    563   unsigned ArgNo = A->getArgNo();
    564   return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
    565 }
    566 
    567 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
    568   // Does the *call site* have the NonNull attribute set on an argument?  We
    569   // use the attribute on the call site to memoize any analysis done in the
    570   // caller. This will also trip if the callee function has a non-null
    571   // parameter attribute, but that's a less interesting case because hopefully
    572   // the callee would already have been simplified based on that.
    573   if (Argument *A = dyn_cast<Argument>(V))
    574     if (paramHasAttr(A, Attribute::NonNull))
    575       return true;
    576 
    577   // Is this an alloca in the caller?  This is distinct from the attribute case
    578   // above because attributes aren't updated within the inliner itself and we
    579   // always want to catch the alloca derived case.
    580   if (isAllocaDerivedArg(V))
    581     // We can actually predict the result of comparisons between an
    582     // alloca-derived value and null. Note that this fires regardless of
    583     // SROA firing.
    584     return true;
    585 
    586   return false;
    587 }
    588 
    589 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
    590   // If the normal destination of the invoke or the parent block of the call
    591   // site is unreachable-terminated, there is little point in inlining this
    592   // unless there is literally zero cost.
    593   // FIXME: Note that it is possible that an unreachable-terminated block has a
     594   // hot entry. For example, in the scenario below, inlining hot_call_X() may be
     595   // beneficial:
    596   // main() {
    597   //   hot_call_1();
    598   //   ...
    599   //   hot_call_N()
    600   //   exit(0);
    601   // }
    602   // For now, we are not handling this corner case here as it is rare in real
     603   // code. In the future, we should handle this based on BPI and BFI in more
    604   // general threshold adjusting heuristics in updateThreshold().
    605   Instruction *Instr = CS.getInstruction();
    606   if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    607     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
    608       return false;
    609   } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    610     return false;
    611 
    612   return true;
    613 }
    614 
    615 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
    616   // If no size growth is allowed for this inlining, set Threshold to 0.
    617   if (!allowSizeGrowth(CS)) {
    618     Threshold = 0;
    619     return;
    620   }
    621 
    622   Function *Caller = CS.getCaller();
    623   if (DefaultInlineThreshold.getNumOccurrences() > 0) {
     624     // An explicitly specified -inline-threshold overrides the threshold passed to
    625     // CallAnalyzer's constructor.
    626     Threshold = DefaultInlineThreshold;
    627   } else {
    628     // If -inline-threshold is not given, listen to the optsize and minsize
    629     // attributes when they would decrease the threshold.
    630     if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
    631       Threshold = OptMinSizeThreshold;
    632     else if (Caller->optForSize() && OptSizeThreshold < Threshold)
    633       Threshold = OptSizeThreshold;
    634   }
    635 
    636   bool HotCallsite = false;
    637   uint64_t TotalWeight;
    638   if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
    639       PSI->isHotCount(TotalWeight))
    640     HotCallsite = true;
    641 
    642   // Listen to the inlinehint attribute or profile based hotness information
    643   // when it would increase the threshold and the caller does not need to
    644   // minimize its size.
    645   bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
    646                     PSI->isHotFunction(&Callee) ||
    647                     HotCallsite;
    648   if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
    649     Threshold = HintThreshold;
    650 
    651   bool ColdCallee = PSI->isColdFunction(&Callee);
     652   // An explicit -inline-threshold command line argument overrides the default
    653   // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
    654   // do not use the default cold threshold even if it is smaller.
    655   if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
    656        ColdThreshold.getNumOccurrences() > 0) &&
    657       ColdCallee && ColdThreshold < Threshold)
    658     Threshold = ColdThreshold;
    659 
    660   // Finally, take the target-specific inlining threshold multiplier into
    661   // account.
    662   Threshold *= TTI.getInliningThresholdMultiplier();
    663 }
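
         // Worked example using the option defaults defined above: with no
         // explicit -inline-threshold, a caller compiled for minsize caps the
         // threshold at OptMinSizeThreshold (25); otherwise an inline-hint
         // callee, a hot callee, or a hot call site can raise it to
         // HintThreshold (325), and a cold callee can lower it to
         // ColdThreshold (225) when that is below the current value. The
         // result is then scaled by TTI.getInliningThresholdMultiplier().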
    664 
    665 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
    666   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    667   // First try to handle simplified comparisons.
    668   if (!isa<Constant>(LHS))
    669     if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
    670       LHS = SimpleLHS;
    671   if (!isa<Constant>(RHS))
    672     if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
    673       RHS = SimpleRHS;
    674   if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    675     if (Constant *CRHS = dyn_cast<Constant>(RHS))
    676       if (Constant *C =
    677               ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
    678         SimplifiedValues[&I] = C;
    679         return true;
    680       }
    681   }
    682 
    683   if (I.getOpcode() == Instruction::FCmp)
    684     return false;
    685 
    686   // Otherwise look for a comparison between constant offset pointers with
    687   // a common base.
    688   Value *LHSBase, *RHSBase;
    689   APInt LHSOffset, RHSOffset;
    690   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
    691   if (LHSBase) {
    692     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    693     if (RHSBase && LHSBase == RHSBase) {
    694       // We have common bases, fold the icmp to a constant based on the
    695       // offsets.
    696       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
    697       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
    698       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
    699         SimplifiedValues[&I] = C;
    700         ++NumConstantPtrCmps;
    701         return true;
    702       }
    703     }
    704   }
    705 
    706   // If the comparison is an equality comparison with null, we can simplify it
     707   // if we know the value (argument) can't be null.
    708   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
    709       isKnownNonNullInCallee(I.getOperand(0))) {
    710     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    711     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
    712                                       : ConstantInt::getFalse(I.getType());
    713     return true;
    714   }
    715   // Finally check for SROA candidates in comparisons.
    716   Value *SROAArg;
    717   DenseMap<Value *, int>::iterator CostIt;
    718   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    719     if (isa<ConstantPointerNull>(I.getOperand(1))) {
    720       accumulateSROACost(CostIt, InlineConstants::InstrCost);
    721       return true;
    722     }
    723 
    724     disableSROA(CostIt);
    725   }
    726 
    727   return false;
    728 }
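
         // Illustrative example (hypothetical IR): if the argument %p is known
         // to be derived from a caller alloca, then
         //   %c = icmp eq i8* %p, null
         // folds to false above (and icmp ne folds to true), so a branch on %c
         // can later be treated as a constant branch.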
    729 
    730 bool CallAnalyzer::visitSub(BinaryOperator &I) {
    731   // Try to handle a special case: we can fold computing the difference of two
    732   // constant-related pointers.
    733   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    734   Value *LHSBase, *RHSBase;
    735   APInt LHSOffset, RHSOffset;
    736   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
    737   if (LHSBase) {
    738     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    739     if (RHSBase && LHSBase == RHSBase) {
    740       // We have common bases, fold the subtract to a constant based on the
    741       // offsets.
    742       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
    743       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
    744       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
    745         SimplifiedValues[&I] = C;
    746         ++NumConstantPtrDiffs;
    747         return true;
    748       }
    749     }
    750   }
    751 
    752   // Otherwise, fall back to the generic logic for simplifying and handling
    753   // instructions.
    754   return Base::visitSub(I);
    755 }
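
         // Illustrative example (hypothetical IR, assuming %p is a pointer
         // argument already tracked in ConstantOffsetPtrs):
         //   %a  = getelementptr inbounds i8, i8* %p, i64 8
         //   %b  = getelementptr inbounds i8, i8* %p, i64 4
         //   %ai = ptrtoint i8* %a to i64
         //   %bi = ptrtoint i8* %b to i64
         //   %d  = sub i64 %ai, %bi
         // folds %d to the constant 4, because both operands carry the common
         // base %p through the GEP and ptrtoint visitors above.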
    756 
    757 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
    758   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
    759   const DataLayout &DL = F.getParent()->getDataLayout();
    760   if (!isa<Constant>(LHS))
    761     if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
    762       LHS = SimpleLHS;
    763   if (!isa<Constant>(RHS))
    764     if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
    765       RHS = SimpleRHS;
    766   Value *SimpleV = nullptr;
    767   if (auto FI = dyn_cast<FPMathOperator>(&I))
    768     SimpleV =
    769         SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
    770   else
    771     SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
    772 
    773   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    774     SimplifiedValues[&I] = C;
    775     return true;
    776   }
    777 
    778   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
    779   disableSROA(LHS);
    780   disableSROA(RHS);
    781 
    782   return false;
    783 }
    784 
    785 bool CallAnalyzer::visitLoad(LoadInst &I) {
    786   Value *SROAArg;
    787   DenseMap<Value *, int>::iterator CostIt;
    788   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    789     if (I.isSimple()) {
    790       accumulateSROACost(CostIt, InlineConstants::InstrCost);
    791       return true;
    792     }
    793 
    794     disableSROA(CostIt);
    795   }
    796 
    797   return false;
    798 }
    799 
    800 bool CallAnalyzer::visitStore(StoreInst &I) {
    801   Value *SROAArg;
    802   DenseMap<Value *, int>::iterator CostIt;
    803   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    804     if (I.isSimple()) {
    805       accumulateSROACost(CostIt, InlineConstants::InstrCost);
    806       return true;
    807     }
    808 
    809     disableSROA(CostIt);
    810   }
    811 
    812   return false;
    813 }
    814 
    815 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
    816   // Constant folding for extract value is trivial.
    817   Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
    818   if (!C)
    819     C = SimplifiedValues.lookup(I.getAggregateOperand());
    820   if (C) {
    821     SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    822     return true;
    823   }
    824 
     825   // SROA can look through these, but we still give them a cost.
    826   return false;
    827 }
    828 
    829 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
    830   // Constant folding for insert value is trivial.
    831   Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
    832   if (!AggC)
    833     AggC = SimplifiedValues.lookup(I.getAggregateOperand());
    834   Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
    835   if (!InsertedC)
    836     InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
    837   if (AggC && InsertedC) {
    838     SimplifiedValues[&I] =
    839         ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
    840     return true;
    841   }
    842 
     843   // SROA can look through these, but we still give them a cost.
    844   return false;
    845 }
    846 
    847 /// \brief Try to simplify a call site.
    848 ///
    849 /// Takes a concrete function and callsite and tries to actually simplify it by
    850 /// analyzing the arguments and call itself with instsimplify. Returns true if
    851 /// it has simplified the callsite to some other entity (a constant), making it
    852 /// free.
    853 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
    854   // FIXME: Using the instsimplify logic directly for this is inefficient
    855   // because we have to continually rebuild the argument list even when no
    856   // simplifications can be performed. Until that is fixed with remapping
    857   // inside of instsimplify, directly constant fold calls here.
    858   if (!canConstantFoldCallTo(F))
    859     return false;
    860 
    861   // Try to re-map the arguments to constants.
    862   SmallVector<Constant *, 4> ConstantArgs;
    863   ConstantArgs.reserve(CS.arg_size());
    864   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
    865        ++I) {
    866     Constant *C = dyn_cast<Constant>(*I);
    867     if (!C)
    868       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    869     if (!C)
    870       return false; // This argument doesn't map to a constant.
    871 
    872     ConstantArgs.push_back(C);
    873   }
    874   if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    875     SimplifiedValues[CS.getInstruction()] = C;
    876     return true;
    877   }
    878 
    879   return false;
    880 }
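
         // Illustrative example (assuming the callee is an intrinsic that
         // canConstantFoldCallTo accepts, e.g. @llvm.fabs.f64): a call such as
         //   %r = call double @llvm.fabs.f64(double %x)
         // where %x has already been simplified to the constant -2.0 folds to
         // 2.0, is recorded in SimplifiedValues, and the call site becomes
         // free.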
    881 
    882 bool CallAnalyzer::visitCallSite(CallSite CS) {
    883   if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
    884       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    885     // This aborts the entire analysis.
    886     ExposesReturnsTwice = true;
    887     return false;
    888   }
    889   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    890     ContainsNoDuplicateCall = true;
    891 
    892   if (Function *F = CS.getCalledFunction()) {
    893     // When we have a concrete function, first try to simplify it directly.
    894     if (simplifyCallSite(F, CS))
    895       return true;
    896 
    897     // Next check if it is an intrinsic we know about.
    898     // FIXME: Lift this into part of the InstVisitor.
    899     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
    900       switch (II->getIntrinsicID()) {
    901       default:
    902         return Base::visitCallSite(CS);
    903 
    904       case Intrinsic::load_relative:
    905         // This is normally lowered to 4 LLVM instructions.
    906         Cost += 3 * InlineConstants::InstrCost;
    907         return false;
    908 
    909       case Intrinsic::memset:
    910       case Intrinsic::memcpy:
    911       case Intrinsic::memmove:
    912         // SROA can usually chew through these intrinsics, but they aren't free.
    913         return false;
    914       case Intrinsic::localescape:
    915         HasFrameEscape = true;
    916         return false;
    917       }
    918     }
    919 
    920     if (F == CS.getInstruction()->getParent()->getParent()) {
    921       // This flag will fully abort the analysis, so don't bother with anything
    922       // else.
    923       IsRecursiveCall = true;
    924       return false;
    925     }
    926 
    927     if (TTI.isLoweredToCall(F)) {
    928       // We account for the average 1 instruction per call argument setup
    929       // here.
    930       Cost += CS.arg_size() * InlineConstants::InstrCost;
    931 
    932       // Everything other than inline ASM will also have a significant cost
    933       // merely from making the call.
    934       if (!isa<InlineAsm>(CS.getCalledValue()))
    935         Cost += InlineConstants::CallPenalty;
    936     }
    937 
    938     return Base::visitCallSite(CS);
    939   }
    940 
    941   // Otherwise we're in a very special case -- an indirect function call. See
    942   // if we can be particularly clever about this.
    943   Value *Callee = CS.getCalledValue();
    944 
    945   // First, pay the price of the argument setup. We account for the average
    946   // 1 instruction per call argument setup here.
    947   Cost += CS.arg_size() * InlineConstants::InstrCost;
    948 
    949   // Next, check if this happens to be an indirect function call to a known
    950   // function in this inline context. If not, we've done all we can.
    951   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
    952   if (!F)
    953     return Base::visitCallSite(CS);
    954 
    955   // If we have a constant that we are calling as a function, we can peer
    956   // through it and see the function target. This happens not infrequently
    957   // during devirtualization and so we want to give it a hefty bonus for
    958   // inlining, but cap that bonus in the event that inlining wouldn't pan
    959   // out. Pretend to inline the function, with a custom threshold.
    960   CallAnalyzer CA(TTI, ACT, PSI, *F, InlineConstants::IndirectCallThreshold,
    961                   CS);
    962   if (CA.analyzeCall(CS)) {
    963     // We were able to inline the indirect call! Subtract the cost from the
    964     // threshold to get the bonus we want to apply, but don't go below zero.
    965     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
    966   }
    967 
    968   return Base::visitCallSite(CS);
    969 }
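
         // Cost accounting example for the code above: a direct call that is
         // really lowered to a call and takes three arguments adds
         // 3 * InlineConstants::InstrCost + InlineConstants::CallPenalty here,
         // while an indirect call pays the same per-argument setup cost but,
         // when its target is discovered through SimplifiedValues and the
         // nested analysis succeeds, earns back a bonus of
         // CA.getThreshold() - CA.getCost() (clamped at zero).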
    970 
    971 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
    972   // At least one return instruction will be free after inlining.
    973   bool Free = !HasReturn;
    974   HasReturn = true;
    975   return Free;
    976 }
    977 
    978 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
    979   // We model unconditional branches as essentially free -- they really
    980   // shouldn't exist at all, but handling them makes the behavior of the
    981   // inliner more regular and predictable. Interestingly, conditional branches
    982   // which will fold away are also free.
    983   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
    984          dyn_cast_or_null<ConstantInt>(
    985              SimplifiedValues.lookup(BI.getCondition()));
    986 }
    987 
    988 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
    989   // We model unconditional switches as free, see the comments on handling
    990   // branches.
    991   if (isa<ConstantInt>(SI.getCondition()))
    992     return true;
    993   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    994     if (isa<ConstantInt>(V))
    995       return true;
    996 
    997   // Otherwise, we need to accumulate a cost proportional to the number of
    998   // distinct successor blocks. This fan-out in the CFG cannot be represented
    999   // for free even if we can represent the core switch as a jumptable that
   1000   // takes a single instruction.
   1001   //
   1002   // NB: We convert large switches which are just used to initialize large phi
   1003   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
   1004   // inlining those. It will prevent inlining in cases where the optimization
   1005   // does not (yet) fire.
   1006   SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
   1007   SuccessorBlocks.insert(SI.getDefaultDest());
   1008   for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
   1009     SuccessorBlocks.insert(I.getCaseSuccessor());
   1010   // Add cost corresponding to the number of distinct destinations. The first
   1011   // we model as free because of fallthrough.
   1012   Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
   1013   return false;
   1014 }
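
         // Illustrative example: a switch with a default destination and four
         // cases that branch to four distinct blocks has five successors, so
         // it adds (5 - 1) * InlineConstants::InstrCost; if every case shares
         // the default destination, no extra successor cost is added (the
         // switch itself still counts as one instruction because this visitor
         // returns false).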
   1015 
   1016 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
   1017   // We never want to inline functions that contain an indirectbr.  This is
    1018   // incorrect because all the blockaddresses (in static global initializers
   1019   // for example) would be referring to the original function, and this
   1020   // indirect jump would jump from the inlined copy of the function into the
   1021   // original function which is extremely undefined behavior.
   1022   // FIXME: This logic isn't really right; we can safely inline functions with
   1023   // indirectbr's as long as no other function or global references the
   1024   // blockaddress of a block within the current function.
   1025   HasIndirectBr = true;
   1026   return false;
   1027 }
   1028 
   1029 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
   1030   // FIXME: It's not clear that a single instruction is an accurate model for
   1031   // the inline cost of a resume instruction.
   1032   return false;
   1033 }
   1034 
   1035 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
   1036   // FIXME: It's not clear that a single instruction is an accurate model for
   1037   // the inline cost of a cleanupret instruction.
   1038   return false;
   1039 }
   1040 
   1041 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
   1042   // FIXME: It's not clear that a single instruction is an accurate model for
   1043   // the inline cost of a catchret instruction.
   1044   return false;
   1045 }
   1046 
   1047 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
    1048   // FIXME: It might be reasonable to discount the cost of instructions leading
   1049   // to unreachable as they have the lowest possible impact on both runtime and
   1050   // code size.
   1051   return true; // No actual code is needed for unreachable.
   1052 }
   1053 
   1054 bool CallAnalyzer::visitInstruction(Instruction &I) {
   1055   // Some instructions are free. All of the free intrinsics can also be
   1056   // handled by SROA, etc.
   1057   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
   1058     return true;
   1059 
   1060   // We found something we don't understand or can't handle. Mark any SROA-able
   1061   // values in the operand list as no longer viable.
   1062   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
   1063     disableSROA(*OI);
   1064 
   1065   return false;
   1066 }
   1067 
   1068 /// \brief Analyze a basic block for its contribution to the inline cost.
   1069 ///
   1070 /// This method walks the analyzer over every instruction in the given basic
   1071 /// block and accounts for their cost during inlining at this callsite. It
   1072 /// aborts early if the threshold has been exceeded or an impossible to inline
   1073 /// construct has been detected. It returns false if inlining is no longer
   1074 /// viable, and true if inlining remains viable.
   1075 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
   1076                                 SmallPtrSetImpl<const Value *> &EphValues) {
   1077   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    1078     // FIXME: Currently, the number of instructions in a function, regardless of
    1079     // our ability to simplify them during inlining into constants or dead code,
    1080     // is actually used by the vector bonus heuristic. As long as that's true,
   1081     // we have to special case debug intrinsics here to prevent differences in
   1082     // inlining due to debug symbols. Eventually, the number of unsimplified
   1083     // instructions shouldn't factor into the cost computation, but until then,
   1084     // hack around it here.
   1085     if (isa<DbgInfoIntrinsic>(I))
   1086       continue;
   1087 
   1088     // Skip ephemeral values.
   1089     if (EphValues.count(&*I))
   1090       continue;
   1091 
   1092     ++NumInstructions;
   1093     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
   1094       ++NumVectorInstructions;
   1095 
   1096     // If the instruction is floating point, and the target says this operation
   1097     // is expensive or the function has the "use-soft-float" attribute, this may
   1098     // eventually become a library call. Treat the cost as such.
   1099     if (I->getType()->isFloatingPointTy()) {
   1100       bool hasSoftFloatAttr = false;
   1101 
   1102       // If the function has the "use-soft-float" attribute, mark it as
   1103       // expensive.
   1104       if (F.hasFnAttribute("use-soft-float")) {
   1105         Attribute Attr = F.getFnAttribute("use-soft-float");
   1106         StringRef Val = Attr.getValueAsString();
   1107         if (Val == "true")
   1108           hasSoftFloatAttr = true;
   1109       }
   1110 
   1111       if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
   1112           hasSoftFloatAttr)
   1113         Cost += InlineConstants::CallPenalty;
   1114     }
   1115 
   1116     // If the instruction simplified to a constant, there is no cost to this
   1117     // instruction. Visit the instructions using our InstVisitor to account for
   1118     // all of the per-instruction logic. The visit tree returns true if we
   1119     // consumed the instruction in any way, and false if the instruction's base
   1120     // cost should count against inlining.
   1121     if (Base::visit(&*I))
   1122       ++NumInstructionsSimplified;
   1123     else
   1124       Cost += InlineConstants::InstrCost;
   1125 
    1126     // If visiting this instruction detected an uninlinable pattern, abort.
   1127     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
   1128         HasIndirectBr || HasFrameEscape)
   1129       return false;
   1130 
   1131     // If the caller is a recursive function then we don't want to inline
   1132     // functions which allocate a lot of stack space because it would increase
   1133     // the caller stack usage dramatically.
   1134     if (IsCallerRecursive &&
   1135         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
   1136       return false;
   1137 
    1138     // Check if we've passed the maximum possible threshold so we don't spin in
   1139     // huge basic blocks that will never inline.
   1140     if (Cost > Threshold)
   1141       return false;
   1142   }
   1143 
   1144   return true;
   1145 }
   1146 
   1147 /// \brief Compute the base pointer and cumulative constant offsets for V.
   1148 ///
   1149 /// This strips all constant offsets off of V, leaving it the base pointer, and
   1150 /// accumulates the total constant offset applied in the returned constant. It
   1151 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
   1152 /// no constant offsets applied.
   1153 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
   1154   if (!V->getType()->isPointerTy())
   1155     return nullptr;
   1156 
   1157   const DataLayout &DL = F.getParent()->getDataLayout();
   1158   unsigned IntPtrWidth = DL.getPointerSizeInBits();
   1159   APInt Offset = APInt::getNullValue(IntPtrWidth);
   1160 
   1161   // Even though we don't look through PHI nodes, we could be called on an
   1162   // instruction in an unreachable block, which may be on a cycle.
   1163   SmallPtrSet<Value *, 4> Visited;
   1164   Visited.insert(V);
   1165   do {
   1166     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
   1167       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
   1168         return nullptr;
   1169       V = GEP->getPointerOperand();
   1170     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
   1171       V = cast<Operator>(V)->getOperand(0);
   1172     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
   1173       if (GA->isInterposable())
   1174         break;
   1175       V = GA->getAliasee();
   1176     } else {
   1177       break;
   1178     }
   1179     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
   1180   } while (Visited.insert(V).second);
   1181 
   1182   Type *IntPtrTy = DL.getIntPtrType(V->getContext());
   1183   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
   1184 }
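
         // Illustrative example (hypothetical IR, assuming 64-bit pointers):
         // given
         //   %p = getelementptr inbounds {i32, i32}, {i32, i32}* %base, i32 0, i32 1
         // this routine rewrites V from %p to %base and returns the constant
         // i64 4; a value that is already a bare pointer (e.g. an alloca) is
         // left unchanged and the constant 0 is returned.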
   1185 
   1186 /// \brief Analyze a call site for potential inlining.
   1187 ///
   1188 /// Returns true if inlining this call is viable, and false if it is not
   1189 /// viable. It computes the cost and adjusts the threshold based on numerous
   1190 /// factors and heuristics. If this method returns false but the computed cost
   1191 /// is below the computed threshold, then inlining was forcibly disabled by
   1192 /// some artifact of the routine.
   1193 bool CallAnalyzer::analyzeCall(CallSite CS) {
   1194   ++NumCallsAnalyzed;
   1195 
   1196   // Perform some tweaks to the cost and threshold based on the direct
   1197   // callsite information.
   1198 
   1199   // We want to more aggressively inline vector-dense kernels, so up the
   1200   // threshold, and we'll lower it if the % of vector instructions gets too
    1201   // low. Note that these bonuses are somewhat arbitrary and evolved over time
   1202   // by accident as much as because they are principled bonuses.
   1203   //
   1204   // FIXME: It would be nice to remove all such bonuses. At least it would be
   1205   // nice to base the bonus values on something more scientific.
   1206   assert(NumInstructions == 0);
   1207   assert(NumVectorInstructions == 0);
   1208 
   1209   // Update the threshold based on callsite properties
   1210   updateThreshold(CS, F);
   1211 
   1212   FiftyPercentVectorBonus = 3 * Threshold / 2;
   1213   TenPercentVectorBonus = 3 * Threshold / 4;
   1214   const DataLayout &DL = F.getParent()->getDataLayout();
   1215 
   1216   // Track whether the post-inlining function would have more than one basic
   1217   // block. A single basic block is often intended for inlining. Balloon the
   1218   // threshold by 50% until we pass the single-BB phase.
   1219   bool SingleBB = true;
   1220   int SingleBBBonus = Threshold / 2;
   1221 
   1222   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    1223   // this Threshold at any time, and cost cannot decrease, we can stop processing
   1224   // the rest of the function body.
   1225   Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
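           // For example, with a starting Threshold of 225 this speculatively
           // raises it to 225 + 112 (single-BB bonus) + 337 (vector bonus) =
           // 674; bonuses that do not pan out are expected to be removed again
           // after the function body has been walked.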
   1226 
   1227   // Give out bonuses per argument, as the instructions setting them up will
   1228   // be gone after inlining.
   1229   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
   1230     if (CS.isByValArgument(I)) {
   1231       // We approximate the number of loads and stores needed by dividing the
   1232       // size of the byval type by the target's pointer size.
   1233       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
   1234       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
   1235       unsigned PointerSize = DL.getPointerSizeInBits();
   1236       // Ceiling division.
    1237       // Ceiling division (see the worked example after this loop).
   1238 
   1239       // If it generates more than 8 stores it is likely to be expanded as an
   1240       // inline memcpy so we take that as an upper bound. Otherwise we assume
   1241       // one load and one store per word copied.
   1242       // FIXME: The maxStoresPerMemcpy setting from the target should be used
   1243       // here instead of a magic number of 8, but it's not available via
   1244       // DataLayout.
   1245       NumStores = std::min(NumStores, 8U);
   1246 
   1247       Cost -= 2 * NumStores * InlineConstants::InstrCost;
   1248     } else {
   1249       // For non-byval arguments subtract off one instruction per call
   1250       // argument.
   1251       Cost -= InlineConstants::InstrCost;
   1252     }
   1253   }
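
          // Illustrative numbers only (assuming 64-bit pointers and the current
          // InlineConstants::InstrCost of 5): a byval argument pointing at a 24-byte
          // struct needs ceil(192 / 64) = 3 word copies, so Cost drops by
          // 2 * 3 * 5 = 30, while each non-byval argument simply drops Cost by 5.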
   1254 
   1255   // If there is only one call of the function, and it has internal linkage,
   1256   // the cost of inlining it drops dramatically.
   1257   bool OnlyOneCallAndLocalLinkage =
   1258       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
   1259   if (OnlyOneCallAndLocalLinkage)
   1260     Cost += InlineConstants::LastCallToStaticBonus;
   1261 
   1262   // If this function uses the coldcc calling convention, prefer not to inline
   1263   // it.
   1264   if (F.getCallingConv() == CallingConv::Cold)
   1265     Cost += InlineConstants::ColdccPenalty;
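          // Both adjustments come straight from InlineConstants: LastCallToStaticBonus
          // is a large negative value, so a sole internal caller is very likely to be
          // inlined, while ColdccPenalty is a large positive value that will usually
          // push Cost past the threshold on its own.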
   1266 
   1267   // Check if we're done. This can happen due to bonuses and penalties.
   1268   if (Cost > Threshold)
   1269     return false;
   1270 
   1271   if (F.empty())
   1272     return true;
   1273 
   1274   Function *Caller = CS.getInstruction()->getParent()->getParent();
   1275   // Check if the caller function is recursive itself.
   1276   for (User *U : Caller->users()) {
   1277     CallSite Site(U);
   1278     if (!Site)
   1279       continue;
   1280     Instruction *I = Site.getInstruction();
   1281     if (I->getParent()->getParent() == Caller) {
   1282       IsCallerRecursive = true;
   1283       break;
   1284     }
   1285   }
   1286 
   1287   // Populate our simplified values by mapping from function arguments to call
   1288   // arguments with known important simplifications.
   1289   CallSite::arg_iterator CAI = CS.arg_begin();
   1290   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
   1291        FAI != FAE; ++FAI, ++CAI) {
   1292     assert(CAI != CS.arg_end());
   1293     if (Constant *C = dyn_cast<Constant>(CAI))
   1294       SimplifiedValues[&*FAI] = C;
   1295 
   1296     Value *PtrArg = *CAI;
   1297     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
   1298       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
   1299 
   1300       // We can SROA any pointer arguments derived from alloca instructions.
   1301       if (isa<AllocaInst>(PtrArg)) {
   1302         SROAArgValues[&*FAI] = PtrArg;
   1303         SROAArgCosts[PtrArg] = 0;
   1304       }
   1305     }
   1306   }
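
          // For example, given a hypothetical call 'f(42, &LocalBuf)' where LocalBuf
          // is an alloca in the caller, the first formal parameter maps to the
          // constant 42 in SimplifiedValues, and the second is recorded in both
          // ConstantOffsetPtrs and SROAArgValues as an SROA candidate.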
   1307   NumConstantArgs = SimplifiedValues.size();
   1308   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
   1309   NumAllocaArgs = SROAArgValues.size();
   1310 
   1311   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
   1312   // the ephemeral values multiple times (and they're completely determined by
   1313   // the callee, so this is purely duplicate work).
   1314   SmallPtrSet<const Value *, 32> EphValues;
   1315   CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
   1316                                       EphValues);
   1317 
   1318   // The worklist of live basic blocks in the callee *after* inlining. We avoid
   1319   // adding basic blocks of the callee which can be proven to be dead for this
   1320   // particular call site in order to get more accurate cost estimates. This
   1321   // requires a somewhat heavyweight iteration pattern: we need to walk the
   1322   // basic blocks in a breadth-first order as we insert live successors. To
   1323   // accomplish this we use a small-size-optimized SetVector; iteration counts
   1324   // stay small in practice because we exit as soon as we cross the threshold.
   1325   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
   1326                     SmallPtrSet<BasicBlock *, 16>>
   1327       BBSetVector;
   1328   BBSetVector BBWorklist;
   1329   BBWorklist.insert(&F.getEntryBlock());
   1330   // Note that we *must not* cache the size, this loop grows the worklist.
   1331   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
   1332     // Bail out the moment we cross the threshold. This means we'll under-count
   1333     // the cost, but only when undercounting doesn't matter.
   1334     if (Cost > Threshold)
   1335       break;
   1336 
   1337     BasicBlock *BB = BBWorklist[Idx];
   1338     if (BB->empty())
   1339       continue;
   1340 
   1341     // Disallow inlining a blockaddress. A blockaddress only has defined
   1342     // behavior for an indirect branch in the same function, and we do not
   1343     // currently support inlining indirect branches. But, the inliner may not
   1344     // see an indirect branch that ends up being dead code at a particular call
   1345     // site. If the blockaddress escapes the function, e.g., via a global
   1346     // variable, inlining may lead to an invalid cross-function reference.
   1347     if (BB->hasAddressTaken())
   1348       return false;
   1349 
   1350     // Analyze the cost of this block. If we blow through the threshold, this
   1351     // returns false, and we can bail out.
   1352     if (!analyzeBlock(BB, EphValues))
   1353       return false;
   1354 
   1355     TerminatorInst *TI = BB->getTerminator();
   1356 
   1357     // Add in the live successors by first checking whether we have a terminator
   1358     // that may be simplified based on the values simplified by this call.
   1359     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
   1360       if (BI->isConditional()) {
   1361         Value *Cond = BI->getCondition();
   1362         if (ConstantInt *SimpleCond =
   1363                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
   1364           BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
   1365           continue;
   1366         }
   1367       }
   1368     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
   1369       Value *Cond = SI->getCondition();
   1370       if (ConstantInt *SimpleCond =
   1371               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
   1372         BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
   1373         continue;
   1374       }
   1375     }
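
            // For example, if the call site passes a constant that feeds this branch
            // or switch condition, only the successor actually taken is pushed onto
            // the worklist, so the untaken arms never contribute to Cost.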
   1376 
   1377     // If we're unable to select a particular successor, just count all of
   1378     // them.
   1379     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
   1380          ++TIdx)
   1381       BBWorklist.insert(TI->getSuccessor(TIdx));
   1382 
   1383     // If we had any successors at this point, then post-inlining is likely to
   1384     // have them as well. Note that we assume any basic blocks which existed
   1385     // due to branches or switches which folded above will also fold after
   1386     // inlining.
   1387     if (SingleBB && TI->getNumSuccessors() > 1) {
   1388       // Take off the bonus we applied to the threshold.
   1389       Threshold -= SingleBBBonus;
   1390       SingleBB = false;
   1391     }
   1392   }
   1393 
   1394   // If this is a noduplicate call, we can still inline as long as
   1395   // inlining this would cause the removal of the callee (so the noduplicate
   1396   // instruction is not actually duplicated, just moved into the caller).
   1397   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
   1398     return false;
   1399 
   1400   // We applied the maximum possible vector bonus at the beginning. Now,
   1401   // subtract the excess bonus, if any, from the Threshold before
   1402   // comparing against Cost.
   1403   if (NumVectorInstructions <= NumInstructions / 10)
   1404     Threshold -= FiftyPercentVectorBonus;
   1405   else if (NumVectorInstructions <= NumInstructions / 2)
   1406     Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
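          // Continuing the illustrative numbers above: with a base threshold of 225,
          // a callee with almost no vector code gives back the full 337-point bonus,
          // while one whose instructions are 10-50% vector gives back only
          // 337 - 168 = 169 points and keeps the smaller TenPercentVectorBonus.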
   1407 
   1408   return Cost < std::max(1, Threshold);
   1409 }
   1410 
   1411 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   1412 /// \brief Dump stats about this call's analysis.
   1413 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
   1414 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
   1415   DEBUG_PRINT_STAT(NumConstantArgs);
   1416   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
   1417   DEBUG_PRINT_STAT(NumAllocaArgs);
   1418   DEBUG_PRINT_STAT(NumConstantPtrCmps);
   1419   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
   1420   DEBUG_PRINT_STAT(NumInstructionsSimplified);
   1421   DEBUG_PRINT_STAT(NumInstructions);
   1422   DEBUG_PRINT_STAT(SROACostSavings);
   1423   DEBUG_PRINT_STAT(SROACostSavingsLost);
   1424   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   1425   DEBUG_PRINT_STAT(Cost);
   1426   DEBUG_PRINT_STAT(Threshold);
   1427 #undef DEBUG_PRINT_STAT
   1428 }
   1429 #endif
   1430 
   1431 /// \brief Test whether two functions either both have or both lack the given
   1432 ///        attribute.
   1433 template <typename AttrKind>
   1434 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
   1435   return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
   1436 }
   1437 
   1438 /// \brief Test that there are no attribute conflicts between Caller and Callee
   1439 ///        that prevent inlining.
   1440 static bool functionsHaveCompatibleAttributes(Function *Caller,
   1441                                               Function *Callee,
   1442                                               TargetTransformInfo &TTI) {
   1443   return TTI.areInlineCompatible(Caller, Callee) &&
   1444          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
   1445 }
   1446 
   1447 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
   1448                                TargetTransformInfo &CalleeTTI,
   1449                                AssumptionCacheTracker *ACT,
   1450                                ProfileSummaryInfo *PSI) {
   1451   return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
   1452                        ACT, PSI);
   1453 }
   1454 
   1455 int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
   1456                                         unsigned SizeOptLevel) {
   1457   if (OptLevel > 2)
   1458     return OptAggressiveThreshold;
   1459   if (SizeOptLevel == 1) // -Os
   1460     return OptSizeThreshold;
   1461   if (SizeOptLevel == 2) // -Oz
   1462     return OptMinSizeThreshold;
   1463   return DefaultInlineThreshold;
   1464 }
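
        // For example, in the usual clang pipeline -O3 reaches this helper with
        // OptLevel = 3 and SizeOptLevel = 0 and yields the aggressive threshold of
        // 275, -Os yields 75, -Oz yields 25, and everything else falls back to the
        // -inline-threshold default of 225.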
   1465 
   1466 int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }
   1467 
   1468 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
   1469                                int DefaultThreshold,
   1470                                TargetTransformInfo &CalleeTTI,
   1471                                AssumptionCacheTracker *ACT,
   1472                                ProfileSummaryInfo *PSI) {
   1473 
   1474   // Cannot inline indirect calls.
   1475   if (!Callee)
   1476     return llvm::InlineCost::getNever();
   1477 
   1478   // Calls to functions with always-inline attributes should be inlined
   1479   // whenever possible.
   1480   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
   1481     if (isInlineViable(*Callee))
   1482       return llvm::InlineCost::getAlways();
   1483     return llvm::InlineCost::getNever();
   1484   }
   1485 
   1486   // Never inline functions with conflicting attributes (unless callee has
   1487   // always-inline attribute).
   1488   if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
   1489     return llvm::InlineCost::getNever();
   1490 
   1491   // Don't inline this call if the caller has the optnone attribute.
   1492   if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
   1493     return llvm::InlineCost::getNever();
   1494 
   1495   // Don't inline functions which can be interposed at link-time.  Don't inline
   1496   // functions marked noinline or call sites marked noinline.
   1497   // Note: inlining non-exact non-interposable functions is fine, since we know
   1498   // we have *a* correct implementation of the source level function.
   1499   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
   1500       CS.isNoInline())
   1501     return llvm::InlineCost::getNever();
   1502 
   1503   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
   1504                      << "...\n");
   1505 
   1506   CallAnalyzer CA(CalleeTTI, ACT, PSI, *Callee, DefaultThreshold, CS);
   1507   bool ShouldInline = CA.analyzeCall(CS);
   1508 
   1509   DEBUG(CA.dump());
   1510 
   1511   // Check if there was a reason to force inlining or to forbid it.
   1512   if (!ShouldInline && CA.getCost() < CA.getThreshold())
   1513     return InlineCost::getNever();
   1514   if (ShouldInline && CA.getCost() >= CA.getThreshold())
   1515     return InlineCost::getAlways();
   1516 
   1517   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
   1518 }
   1519 
   1520 bool llvm::isInlineViable(Function &F) {
   1521   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
   1522   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
   1523     // Disallow inlining of functions which contain indirect branches or
   1524     // blockaddresses.
   1525     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
   1526       return false;
   1527 
   1528     for (auto &II : *BI) {
   1529       CallSite CS(&II);
   1530       if (!CS)
   1531         continue;
   1532 
   1533       // Disallow recursive calls.
   1534       if (&F == CS.getCalledFunction())
   1535         return false;
   1536 
   1537       // Disallow calls which expose returns-twice to a function not previously
   1538       // attributed as such.
   1539       if (!ReturnsTwice && CS.isCall() &&
   1540           cast<CallInst>(CS.getInstruction())->canReturnTwice())
   1541         return false;
   1542 
   1543       // Disallow inlining functions that call @llvm.localescape. Doing this
   1544       // correctly would require major changes to the inliner.
   1545       if (CS.getCalledFunction() &&
   1546           CS.getCalledFunction()->getIntrinsicID() ==
   1547               llvm::Intrinsic::localescape)
   1548         return false;
   1549     }
   1550   }
   1551 
   1552   return true;
   1553 }
   1554