//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is AMDGPU specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory
/// pointer is passed to a function as an argument. In this situation, we are
/// unable to eliminate private memory in the caller unless inlined and end up
/// with slow and expensive scratch access. Thus, we boost the inline threshold
/// for such functions here.
///
//===----------------------------------------------------------------------===//


#include "AMDGPU.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO/Inliner.h"

using namespace llvm;

#define DEBUG_TYPE "inline"

// Threshold bonus applied when a pointer into a private (scratch) alloca is
// passed to the callee; see AMDGPUInliner::getInlineThreshold().
static cl::opt<int>
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
              cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
                cl::desc("Maximum alloca size to use for inline cost"));

namespace {

// Legacy-PM inliner pass that reuses the generic inliner driver
// (LegacyInlinerBase) but supplies an AMDGPU specific cost model via the
// getInlineCost() override below.
class AMDGPUInliner : public LegacyInlinerBase {

public:
  AMDGPUInliner() : LegacyInlinerBase(ID) {
    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
    Params = getInlineParams();
  }

  static char ID; // Pass identification, replacement for typeid

  // Compute the inline threshold for the given call site, boosted by
  // ArgAllocaCost when a pointer into a private (scratch) alloca of the
  // caller is passed to the callee.
  unsigned getInlineThreshold(CallSite CS) const;

  InlineCost getInlineCost(CallSite CS) override;

  bool runOnSCC(CallGraphSCC &SCC) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  // Cached by runOnSCC() for use in getInlineCost().
  TargetTransformInfoWrapperPass *TTIWP;

  InlineParams Params;
};

} // end anonymous namespace

char AMDGPUInliner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
                      "AMDGPU Function Integration/Inlining", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
                    "AMDGPU Function Integration/Inlining", false, false)

Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }

bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
  return LegacyInlinerBase::runOnSCC(SCC);
}

void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetTransformInfoWrapperPass>();
  LegacyInlinerBase::getAnalysisUsage(AU);
}

unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
  int Thres = Params.DefaultThreshold;

  Function *Caller = CS.getCaller();
  // Listen to the inlinehint attribute when it would increase the threshold
  // and the caller does not need to minimize its size.
  Function *Callee = CS.getCalledFunction();
  bool InlineHint = Callee && !Callee->isDeclaration() &&
                    Callee->hasFnAttribute(Attribute::InlineHint);
  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
      && !Caller->hasFnAttribute(Attribute::MinSize))
    Thres = Params.HintThreshold.getValue();

  const DataLayout &DL = Caller->getParent()->getDataLayout();
  // Indirect call: no callee to inspect, keep the base threshold.
  if (!Callee)
    return (unsigned)Thres;

  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent());

  // If we have a pointer to private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // Increase the inline threshold to allow inlining in this case.
  uint64_t AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CS.args()) {
    Type *Ty = PtrArg->getType();
    if (!Ty->isPointerTy() ||
        Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
      continue;
    PtrArg = GetUnderlyingObject(PtrArg, DL);
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      // Count each static alloca once even if several arguments point at it.
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
        continue;
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive we will not be able
      // to get rid of the scratch anyway, bail out.
      if (AllocaSize > ArgAllocaCutoff) {
        AllocaSize = 0;
        break;
      }
    }
  }
  if (AllocaSize)
    Thres += ArgAllocaCost;

  return (unsigned)Thres;
}

// Check if call is just a wrapper around another call.
// In this case we only have call and ret instructions.
154 static bool isWrapperOnlyCall(CallSite CS) { 155 Function *Callee = CS.getCalledFunction(); 156 if (!Callee || Callee->size() != 1) 157 return false; 158 const BasicBlock &BB = Callee->getEntryBlock(); 159 if (const Instruction *I = BB.getFirstNonPHI()) { 160 if (!isa<CallInst>(I)) { 161 return false; 162 } 163 if (isa<ReturnInst>(*std::next(I->getIterator()))) { 164 LLVM_DEBUG(dbgs() << " Wrapper only call detected: " 165 << Callee->getName() << '\n'); 166 return true; 167 } 168 } 169 return false; 170 } 171 172 InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { 173 Function *Callee = CS.getCalledFunction(); 174 Function *Caller = CS.getCaller(); 175 TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); 176 177 if (!Callee || Callee->isDeclaration() || CS.isNoInline() || 178 !TTI.areInlineCompatible(Caller, Callee)) 179 return llvm::InlineCost::getNever(); 180 181 if (CS.hasFnAttr(Attribute::AlwaysInline)) { 182 if (isInlineViable(*Callee)) 183 return llvm::InlineCost::getAlways(); 184 return llvm::InlineCost::getNever(); 185 } 186 187 if (isWrapperOnlyCall(CS)) 188 return llvm::InlineCost::getAlways(); 189 190 InlineParams LocalParams = Params; 191 LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); 192 bool RemarksEnabled = false; 193 const auto &BBs = Caller->getBasicBlockList(); 194 if (!BBs.empty()) { 195 auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); 196 if (DI.isEnabled()) 197 RemarksEnabled = true; 198 } 199 200 OptimizationRemarkEmitter ORE(Caller); 201 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 202 [this](Function &F) -> AssumptionCache & { 203 return ACT->getAssumptionCache(F); 204 }; 205 206 return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache, 207 None, PSI, RemarksEnabled ? &ORE : nullptr); 208 } 209