//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  int Threshold;
  int Cost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);     void visit(Module &);
  void visit(Function *);   void visit(Function &);
  void visit(BasicBlock *); void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg),
        Threshold(Threshold), Cost(0), IsCallerRecursive(false),
        IsRecursiveCall(false), ExposesReturnsTwice(false),
        HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
        HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
        NumInstructionsSimplified(0), SROACostSavings(0),
        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca, and handle that case.
  if (I.isArrayAllocation()) {
    if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
      ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
      assert(AllocSize && "Allocation size not a constant int?");
      Type *Ty = I.getAllocatedType();
      AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    Type *Ty = I.getAllocatedType();
    AllocatedSize += DL.getTypeAllocSize(Ty);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset
    = ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
                                               COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
  return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
                                                        I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() &&
      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this
    // may eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->mayBeOverridden())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);
  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                    &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If the instruction after the call, or if the normal destination of the
  // invoke is an unreachable instruction, the function is noreturn. As such,
  // there is little point in inlining this unless there is literally zero
  // cost.
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
      Threshold = 0;
  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
    Threshold = 0;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
                                      EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues)) {
      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
          HasIndirectBr || HasFrameEscape)
        return false;

      // If the caller is a recursive function then we don't want to inline
      // functions which allocate a lot of stack space because it would
      // increase the caller stack usage dramatically.
      if (IsCallerRecursive &&
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;

      break;
    }

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond
              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond
            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost <= std::max(0, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                      true, true)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                    true, true)

char InlineCostAnalysis::ID = 0;

InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {}

InlineCostAnalysis::~InlineCostAnalysis() {}

void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  CallGraphSCCPass::getAnalysisUsage(AU);
}

bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
  ACT = &getAnalysis<AssumptionCacheTracker>();
  return false;
}

InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
  return getInlineCost(CS, CS.getCalledFunction(), Threshold);
}

/// \brief Test that two functions either both have or both lack the given
/// attribute.
template <typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
         attributeMatches(Caller, Callee, Attribute::SanitizeThread);
}

InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                             int Threshold) {
  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee,
                                         TTIWP->getTTI(*Callee)))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be redefined at link-time to mean
  // something else. Don't inline functions marked noinline or call sites
  // marked noinline.
  if (Callee->mayBeOverridden() ||
      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                     << "...\n");

  CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool InlineCostAnalysis::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}