//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/IR/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <climits>

namespace llvm {

extern cl::opt<unsigned> PartialUnrollingThreshold;
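
// A minimal sketch of the subclassing contract described below, for a
// hypothetical target "Foo" (the Foo* names are illustrative; in-tree targets
// such as X86TTIImpl follow the same pattern):
//
//   class FooTTIImpl : public BasicTTIImplBase<FooTTIImpl> {
//     typedef BasicTTIImplBase<FooTTIImpl> BaseT;
//     friend BaseT;
//
//     const FooSubtarget *ST;
//     const FooTargetLowering *TLI;
//
//     const FooSubtarget *getST() const { return ST; }
//     const FooTargetLowering *getTLI() const { return TLI; }
//
//   public:
//     explicit FooTTIImpl(const FooTargetMachine *TM, const Function &F)
//         : BaseT(TM, F.getParent()->getDataLayout()),
//           ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
//   };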

/// \brief Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  typedef TargetTransformInfoImplCRTPBase<T> BaseT;
  typedef TargetTransformInfo TTI;

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(Type *Ty) {
    assert(Ty->isVectorTy() && "Can only shuffle vectors");
    unsigned Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from the
    // argument vectors plus the cost of inserting them into the result
    // vector.

    // E.g., a shuffle of two <4 x float> vectors with mask <0,5,2,7> extracts
    // element 0 of the first vector, element 1 of the second vector, element
    // 2 of the first vector, and element 3 of the second vector, and inserts
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }
    return Cost;
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace,
                                      unsigned Alignment, bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment,
                                                    Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale,
                           unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) {
    return getTLI()->isFoldableMemAccessOffset(I, Offset);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<const Value *> Arguments) {
    return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Type *> ParamTys) {
    if (IID == Intrinsic::cttz) {
      if (getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    if (IID == Intrinsic::ctlz) {
      if (getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
  }

  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  unsigned getFPOpCost(Type *Ty) {
    // By default, FP instructions are assumed to be no more expensive than
    // integer instructions, since most targets implement them in hardware.
    // Target-specific TTI can override this.
    return TargetTransformInfo::TCC_Basic;
  }

  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
    const TargetLoweringBase *TLI = getTLI();
    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      if (TLI->isTruncateFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    case Instruction::ZExt:
      if (TLI->isZExtFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }

    return BaseT::getOperationCost(Opcode, Ty, OpTy);
  }
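
  // Example (illustrative): on a typical 64-bit target, truncating i64 to
  // i32 only reads the low half of the register, so isTruncateFree returns
  // true and the Trunc case above reports TCC_Free.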

  unsigned getInliningThresholdMultiplier() { return 1; }

  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization
    // Reference Manual, Intel Core models and later have a loop stream
    // detector (and associated uop queue) that can benefit from partial
    // unrolling. The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop
    // predictor and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches.
    //  - The loop must have less than 40 uops in all executed loop branches.

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch
    // limits until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
         ++I) {
      BasicBlock *BB = *I;

      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
          ImmutableCallSite CS(&*J);
          if (const Function *F = CS.getCalledFunction()) {
            if (!static_cast<T *>(this)->isLoweredToCall(F))
              continue;
          }

          return;
        }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Also enable using the trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set the number of instructions optimized away when the "back edge"
    // becomes a "fall through" to the default value of 2.
    UP.BEInsns = 2;
  }
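
  // Example (illustrative): on a subtarget whose scheduling model reports
  // LoopMicroOpBufferSize == 28, a call-free loop gets PartialThreshold = 28,
  // so the unroller may, e.g., unroll a 9-instruction loop body about 3x to
  // better fill the loop buffer without overflowing it.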

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }

  unsigned getRegisterBitWidth(bool Vector) { return 32; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
    assert(Ty->isVectorTy() && "Can only scalarize vectors");
    unsigned Cost = 0;

    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      if (Insert)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied by VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value *, 4> UniqueOperands;
    for (const Value *A : Args) {
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        Type *VecTy = nullptr;
        if (A->getType()->isVectorTy()) {
          VecTy = A->getType();
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
                 "Vector argument does not match VF");
        } else
          VecTy = VectorType::get(A->getType(), VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(Type *VecTy,
                                    ArrayRef<const Value *> Args) {
    assert(VecTy->isVectorTy());

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(VecTy, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args,
                                               VecTy->getVectorNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(VecTy, false, true);

    return Cost;
  }
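
  // Worked example (illustrative): scalarizing an fadd on <4 x float> with
  // the default getVectorInstrCost of 1 per element costs 4 inserts for the
  // result plus 4 extracts per unique non-constant operand, so
  // getScalarizationOverhead(<4 x float>, Args={a, b}) == 4 + 4 + 4 = 12.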

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is
      // twice as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (Ty->isVectorTy()) {
      unsigned Num = Ty->getVectorNumElements();
      unsigned Cost =
          static_cast<T *>(this)->getArithmeticInstrCost(Opcode,
                                                         Ty->getScalarType());
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Ty, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }
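
  // Example (illustrative): an integer add on <8 x i32> on a target whose
  // widest legal vector type is v4i32 legalizes as LT = {2, v4i32}. The add
  // is legal on v4i32, so the cost above is LT.first * OpCost = 2.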

  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp) {
    if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc ||
        Kind == TTI::SK_PermuteSingleSrc) {
      return getPermuteShuffleOverhead(Tp);
    }
    return 1;
  }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    // Check for NOOP conversions.
    if (SrcLT.first == DstLT.first &&
        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

      // Bitcasts and truncations between types that are legalized to the
      // same type are free.
      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
        return 0;
    }

    if (Opcode == Instruction::Trunc &&
        TLI->isTruncateFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::ZExt &&
        TLI->isZExtFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::AddrSpaceCast &&
        TLI->isNoopAddrSpaceCast(Src->getPointerAddressSpace(),
                                 Dst->getPointerAddressSpace()))
      return 0;

    // If this is a zext/sext of a load, return 0 if the corresponding
    // extending load exists on the target.
    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && I &&
        isa<LoadInst>(I->getOperand(0))) {
      EVT ExtVT = EVT::getEVT(Dst);
      EVT LoadVT = EVT::getEVT(Src);
      unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
      if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
        return 0;
    }

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return 1;

    // Handle scalar conversions.
    if (!Src->isVectorTy() && !Dst->isVectorTy()) {

      // Scalar bitcasts are usually free.
      if (Opcode == Instruction::BitCast)
        return 0;

      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (Dst->isVectorTy() && Src->isVectorTy()) {

      // If the cast is between same-sized registers, then the check is
      // simple.
      if (SrcLT.first == DstLT.first &&
          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return 1;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the
      // cost of casting the original vector twice. We also need to factor in
      // the cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
           TargetLowering::TypeSplitVector) ||
          (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
           TargetLowering::TypeSplitVector)) {
        Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
                                         Dst->getVectorNumElements() / 2);
        Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
                                         Src->getVectorNumElements() / 2);
        T *TTI = static_cast<T *>(this);
        return TTI->getVectorSplitCost() +
               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = Dst->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need
    // to assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast)
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
                                : 0) +
             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
                                : 0);

    llvm_unreachable("Unhandled cast");
  }
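
  // Example (illustrative): a bitcast of <4 x i32> to <4 x float> on a target
  // with 128-bit vector registers legalizes both sides to a single register
  // of the same size, so the NOOP check above returns a cost of 0.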

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return static_cast<T *>(this)->getVectorInstrCost(
               Instruction::ExtractElement, VecTy, Index) +
           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
                                                    VecTy->getElementType());
  }

  unsigned getCFInstrCost(unsigned Opcode) {
    // Branches are assumed to be predicted.
    return 0;
  }

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              const Instruction *I) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply by the
      // type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the operation is scalarized.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (ValTy->isVectorTy()) {
      unsigned Num = ValTy->getVectorNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
          Opcode, ValTy->getScalarType(), CondTy, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace,
                           const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assume that all loads of legal types cost 1.
    unsigned Cost = LT.first;

    if (Src->isVectorTy() &&
        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store
      // is legal, this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is
        // scalarized. We must account for the cost of building or
        // decomposing the vector.
        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }
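
  // Example (illustrative): a load of <3 x i32> that is widened to a legal
  // v4i32 reads more bits than the 96-bit memory type provides. If no legal
  // or custom extending load exists, the branch above adds the cost of
  // building the vector from the three scalar loads with inserts.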

  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
    VectorType *VT = dyn_cast<VectorType>(VecTy);
    assert(VT && "Expect a vector type for interleaved memory op");

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
        Opcode, VecTy, Alignment, AddressSpace);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize =
        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., an interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will
    // be used (those corresponding to elements [0:1] and [8:9] of the
    // unlegalized type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {

      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions
      // that will be used. (Multiply before dividing so the integer division
      // doesn't truncate the fraction to zero.)
      Cost = Cost * UsedInsts.count() / NumLegalInsts;
    }

    // Next, add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub-vectors'
      // elements from the wide vector and inserting them into the
      // sub-vectors.
      //
      // E.g., an interleaved load of factor 2 (with one member of index 0):
      //       %vec = load <8 x i32>, <8 x i32>* %ptr
      //       %v0 = shuffle %vec, undef, <0, 2, 4, 6>   ; Index 0
      // The cost is estimated as extracting elements 0, 2, 4, and 6 from the
      // <8 x i32> vector and inserting them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from the loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += static_cast<T *>(this)->getVectorInstrCost(
              Instruction::ExtractElement, VT, Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is the cost of extracting all elements from the
      // sub-vectors and inserting them into the wide vector.
      //
      // E.g., an interleaved store of factor 2:
      //       %interleaved.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //       store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    return Cost;
  }

  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Value *> Args, FastMathFlags FMF,
                                 unsigned VF = 1) {
    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");

    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      SmallVector<Type *, 4> Types;
      for (Value *Op : Args) {
        Type *OpTy = Op->getType();
        assert(VF == 1 || !OpTy->isVectorTy());
        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
      }

      if (VF > 1 && !RetTy->isVoidTy())
        RetTy = VectorType::get(RetTy, VF);

      // Compute the scalarization overhead based on Args for a vector
      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
      // CostModel will pass a vector RetTy and VF == 1.
      unsigned ScalarizationCost = UINT_MAX;
      if (RetVF > 1 || VF > 1) {
        ScalarizationCost = 0;
        if (!RetTy->isVoidTy())
          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
      }

      return static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, RetTy, Types, FMF, ScalarizationCost);
    }
    case Intrinsic::masked_scatter: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
      return static_cast<T *>(this)->getGatherScatterOpCost(
          Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
    }
    case Intrinsic::masked_gather: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
      return static_cast<T *>(this)->getGatherScatterOpCost(
          Instruction::Load, RetTy, Args[0], VarMask, Alignment);
    }
    }
  }
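
  // Operand layout assumed by the two cases above (from the intrinsic
  // signatures):
  //   llvm.masked.scatter(value, ptrs, alignment, mask)   -> mask is Args[3]
  //   llvm.masked.gather(ptrs, alignment, mask, passthru) -> mask is Args[2]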

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
  /// arguments and the return value will be computed based on types.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
                                 unsigned ScalarizationCostPassed = UINT_MAX) {
    SmallVector<unsigned, 2> ISDs;
    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      unsigned ScalarizationCost = ScalarizationCostPassed;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (RetTy->isVectorTy()) {
        if (ScalarizationCostPassed == UINT_MAX)
          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy()) {
          if (ScalarizationCostPassed == UINT_MAX)
            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, ScalarRetTy, ScalarTys, FMF);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
    // Look for intrinsics that can be lowered directly or turned into a
    // scalar intrinsic call.
    case Intrinsic::sqrt:
      ISDs.push_back(ISD::FSQRT);
      break;
    case Intrinsic::sin:
      ISDs.push_back(ISD::FSIN);
      break;
    case Intrinsic::cos:
      ISDs.push_back(ISD::FCOS);
      break;
    case Intrinsic::exp:
      ISDs.push_back(ISD::FEXP);
      break;
    case Intrinsic::exp2:
      ISDs.push_back(ISD::FEXP2);
      break;
    case Intrinsic::log:
      ISDs.push_back(ISD::FLOG);
      break;
    case Intrinsic::log10:
      ISDs.push_back(ISD::FLOG10);
      break;
    case Intrinsic::log2:
      ISDs.push_back(ISD::FLOG2);
      break;
    case Intrinsic::fabs:
      ISDs.push_back(ISD::FABS);
      break;
    case Intrinsic::minnum:
      ISDs.push_back(ISD::FMINNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMINNAN);
      break;
    case Intrinsic::maxnum:
      ISDs.push_back(ISD::FMAXNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMAXNAN);
      break;
    case Intrinsic::copysign:
      ISDs.push_back(ISD::FCOPYSIGN);
      break;
    case Intrinsic::floor:
      ISDs.push_back(ISD::FFLOOR);
      break;
    case Intrinsic::ceil:
      ISDs.push_back(ISD::FCEIL);
      break;
    case Intrinsic::trunc:
      ISDs.push_back(ISD::FTRUNC);
      break;
    case Intrinsic::nearbyint:
      ISDs.push_back(ISD::FNEARBYINT);
      break;
    case Intrinsic::rint:
      ISDs.push_back(ISD::FRINT);
      break;
    case Intrinsic::round:
      ISDs.push_back(ISD::FROUND);
      break;
    case Intrinsic::pow:
      ISDs.push_back(ISD::FPOW);
      break;
    case Intrinsic::fma:
    case Intrinsic::fmuladd:
      ISDs.push_back(ISD::FMA);
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      return 0;
    case Intrinsic::masked_store:
      return static_cast<T *>(this)->getMaskedMemoryOpCost(Instruction::Store,
                                                           Tys[0], 0, 0);
    case Intrinsic::masked_load:
      return static_cast<T *>(this)->getMaskedMemoryOpCost(Instruction::Load,
                                                           RetTy, 0, 0);
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    // FIXME: ctlz, cttz, ...
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<unsigned, 2> LegalCost;
    SmallVector<unsigned, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && TLI->isFAbsFree(LT.second))
          return 0;

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is
        // some overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use
        // them.
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered then assume that the code is
        // twice as expensive.
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI =
        std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

    // If we can't lower fmuladd into an FMA estimate the cost as a floating
    // point mul followed by an add.
    if (IID == Intrinsic::fmuladd)
      return static_cast<T *>(this)->getArithmeticInstrCost(
                 BinaryOperator::FMul, RetTy) +
             static_cast<T *>(this)->getArithmeticInstrCost(
                 BinaryOperator::FAdd, RetTy);

    // Else, assume that we need to scalarize this intrinsic. For math
    // builtins this will emit a costly libcall, adding call overhead and
    // spills. Make it very expensive.
    if (RetTy->isVectorTy()) {
      unsigned ScalarizationCost =
          ((ScalarizationCostPassed != UINT_MAX)
               ? ScalarizationCostPassed
               : getScalarizationOverhead(RetTy, true, false));
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, RetTy->getScalarType(), ScalarTys, FMF);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (Tys[i]->isVectorTy()) {
          if (ScalarizationCostPassed == UINT_MAX)
            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
          ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
        }
      }

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call, make it expensive.
    return SingleCallCost;
  }
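
  // Example (illustrative): llvm.sqrt.v4f32 on a target where FSQRT is legal
  // for v4f32 takes the legal path above with LT = {1, v4f32} and costs 1.
  // On a target without a vector sqrt it falls through to the scalarization
  // estimate: four scalar calls plus the insert/extract overhead.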

  /// \brief Compute a cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr, in which case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate the cost of a call to a
  /// vector counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
  /// \param Tys Argument types.
  /// \returns The cost of the call instruction.
  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
    return LT.first;
  }

  unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                     const SCEV *) {
    return 0;
  }

  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    // Try to calculate the arithmetic and shuffle op costs for reduction
    // operations. We assume a reduction is performed as follows:
    // 1. Non-pairwise reduction:
    //   %val1 = shufflevector <n x t> %val, <n x t> %undef,
    //     <n x i32> <i32 n/2, i32 n/2+1, ..., i32 n-1, i32 undef, ..., i32 undef>
    //             \----------------v-------------/  \----------v------------/
    //                        n/2 elements               n/2 elements
    //   %red1 = op <n x t> %val, <n x t> %val1
    // After this operation we have a vector %red1 where only the first n/2
    // elements are meaningful; the second n/2 elements are undefined and can
    // be dropped. All subsequent operations logically work on a vector of
    // length n/2, though the real vector length is still n.
    //   %val2 = shufflevector <n x t> %red1, <n x t> %undef,
    //     <n x i32> <i32 n/4, i32 n/4+1, ..., i32 n/2-1, i32 undef, ..., i32 undef>
    //             \----------------v-------------/  \----------v------------/
    //                        n/4 elements              3*n/4 elements
    //   %red2 = op <n x t> %red1, <n x t> %val2
    // This works with the vector of length n/2, the resulting vector has
    // meaningful length n/4, and so on.
    // 2. Pairwise reduction:
    // Everything is the same except for an additional shuffle operation
    // which is used to produce operands for the pairwise kind of reduction.
    //   %val1 = shufflevector <n x t> %val, <n x t> %undef,
    //     <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
    //             \-------------v----------/  \----------v------------/
    //                     n/2 elements            n/2 elements
    //   %val2 = shufflevector <n x t> %val, <n x t> %undef,
    //     <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
    //             \-------------v----------/  \----------v------------/
    //                     n/2 elements            n/2 elements
    //   %red1 = op <n x t> %val1, <n x t> %val2
    // Again, the operation is performed on an <n x t> vector, but the
    // resulting vector %red1 is effectively an <n/2 x t> vector.
    //
    // The cost model should take into account that the actual length of the
    // vector is reduced on each iteration.
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, Ty);
      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
      Ty = VectorType::get(ScalarTy, NumVecElts);
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of the
    // vector operations performed on the current platform. That's why
    // several final reduction operations are performed on vectors with the
    // same architecture-dependent length.
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                               NumVecElts, Ty);
    ArithCost += (NumReduxLevels - LongVectorCount) *
                 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
  }

  unsigned getVectorSplitCost() { return 1; }

  /// @}
};
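
// Typical wiring (sketch, mirroring how in-tree target machines expose a
// TTI; "MyTargetMachine" is illustrative): the concrete implementation below
// is handed to the TargetIRAnalysis callback when a target has no custom
// TTI:
//
//   TargetIRAnalysis MyTargetMachine::getTargetIRAnalysis() {
//     return TargetIRAnalysis([this](const Function &F) {
//       return TargetTransformInfo(BasicTTIImpl(this, F));
//     });
//   }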

/// \brief Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  typedef BasicTTIImplBase<BasicTTIImpl> BaseT;
  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H