//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = UINT_MAX;
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
  for (const BasicBlock *BB : L->getBlocks()) {
    for (const Instruction &I : *BB) {
      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
        continue;

      const Value *Ptr = GEP->getPointerOperand();
      const AllocaInst *Alloca =
          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
      if (Alloca) {
        // We want to do whatever we can to limit the number of alloca
        // instructions that make it through to the code generator. allocas
        // require us to use indirect addressing, which is slow and prone to
        // compiler bugs. If this loop does an address calculation on an
        // alloca ptr, then we want to use a higher than normal loop unroll
        // threshold. This will give SROA a better chance to eliminate these
        // allocas.
        //
        // Don't use the maximum allowed value here as it will make some
        // programs way too big.
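        //
        // For example (illustrative): a kernel loop such as
        //   float Acc[4];
        //   for (int I = 0; I != 64; ++I)
        //     Acc[I & 3] += In[I];
        // does its address arithmetic on a private alloca; once the loop is
        // fully unrolled, every I & 3 index becomes a constant, and SROA can
        // promote Acc into registers.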
        UP.Threshold = 800;
      }
    }
  }
}

unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
  if (Vec)
    return 0;

  // Number of VGPRs on SI.
  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return 256;

  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
  return Vector ? 0 : 32;
}

unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) {
  switch (AddrSpace) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    return 128;
  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS:
    return 64;
  case AMDGPUAS::PRIVATE_ADDRESS:
    return 8 * ST->getMaxPrivateElementSize();
  default:
    if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
        (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
         AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
         (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
          AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
      return 128;
    llvm_unreachable("unhandled address space");
  }
}

unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Semi-arbitrary large amount.
  return 64;
}

int AMDGPUTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, only legal vector
  // types, we need to account for split vectors.
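  //
  // For example (illustrative): an add of <4 x i32>, where v4i32 is a legal
  // register type, has LT.first == 1 and NElts == 4, so it is costed below
  // as four full-rate scalar i32 adds.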
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: {
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  }
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    if (SLT == MVT::i64) {
      // These i64 operations are typically split into two 32-bit VALU
      // instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  }
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
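      // The i64 multiply below is costed as four quarter-rate 32-bit
      // multiplies plus four full-rate 32-bit adds, presumably two add/addc
      // pairs combining the partial products.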
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;

  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();

      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Assuming no fp32 denormals lowering.
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      assert(!ST->hasFP32Denormals() && "will change when supported");
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
      return LT.first * NElts * Cost;
    }

    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement:
    // Extracts are just reads of a subregister, so they are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.
    //
    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
                                          const IntrinsicInst *I) {
  switch (I->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::not_intrinsic:
    // This means we have an intrinsic that isn't defined in
    // IntrinsicsAMDGPU.td, so fall through to the name-based lookup below.
    break;

  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_mbcnt_hi:
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::r600_read_tidig_x:
  case Intrinsic::r600_read_tidig_y:
  case Intrinsic::r600_read_tidig_z:
  case Intrinsic::amdgcn_image_atomic_swap:
  case Intrinsic::amdgcn_image_atomic_add:
  case Intrinsic::amdgcn_image_atomic_sub:
  case Intrinsic::amdgcn_image_atomic_smin:
  case Intrinsic::amdgcn_image_atomic_umin:
  case Intrinsic::amdgcn_image_atomic_smax:
  case Intrinsic::amdgcn_image_atomic_umax:
  case Intrinsic::amdgcn_image_atomic_and:
  case Intrinsic::amdgcn_image_atomic_or:
  case Intrinsic::amdgcn_image_atomic_xor:
  case Intrinsic::amdgcn_image_atomic_inc:
  case Intrinsic::amdgcn_image_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_ps_live:
    return true;
  }

  StringRef Name = I->getCalledFunction()->getName();
  switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
  default:
    return false;
  case AMDGPUIntrinsic::SI_fs_interp:
  case AMDGPUIntrinsic::SI_fs_constant:
    return true;
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  if (!AMDGPU::isShader(F->getCallingConv()))
    return true;

  // For non-compute shaders, SGPR inputs are marked with either inreg or
  // byval.
  if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
      F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
    return true;

  // Everything else is in VGPRs.
  return false;
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // its original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
  }

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}