//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// \brief Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(Type *Ty) {
    assert(Ty->isVectorTy() && "Can only shuffle vectors");
    unsigned Cost = 0;
    // The shuffle cost is equal to the cost of extracting each element from
    // its source vector plus the cost of inserting it into the result
    // vector.
    // E.g., shuffling two <4 x float> vectors with the mask <0,5,2,7>
    // extracts index 0 of the first vector, index 1 of the second vector,
    // index 2 of the first vector, and index 3 of the second vector, and
    // inserts them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      Cost += static_cast<T *>(this)
                  ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }
    return Cost;
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// \brief Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                      unsigned BitWidth, unsigned AddressSpace,
                                      unsigned Alignment, bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(E, AddressSpace, Alignment,
                                                    Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  int getExtCost(const Instruction *I, const Value *Src) {
    if (getTLI()->isExtFree(I))
      return TargetTransformInfo::TCC_Free;

    if (isa<ZExtInst>(I) || isa<SExtInst>(I))
      if (const LoadInst *LI = dyn_cast<LoadInst>(Src))
        if (getTLI()->isExtLoad(LI, I, DL))
          return TargetTransformInfo::TCC_Free;

    return TargetTransformInfo::TCC_Basic;
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<const Value *> Arguments) {
    return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
  }

  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<Type *> ParamTys) {
    if (IID == Intrinsic::cttz) {
      if (getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    if (IID == Intrinsic::ctlz) {
      if (getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      return TargetTransformInfo::TCC_Expensive;
    }

    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the
    /// actual numbers found in lowering. This function ignores switches that
    /// are lowered with a mix of jump table / bit test / BTree. It was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used
    /// in other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getPointerSizeInBits() < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test.
    if (N <= DL.getPointerSizeInBits()) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
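    // For illustration (hypothetical numbers): a switch with N = 100 cases
    // spanning the value range [0, 127] gives Range = 128, i.e. a table
    // about 78% dense; if the target's isSuitableForJumpTable() accepts
    // that, the switch is reported as one cluster with JumpTableSize = 128.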
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table.
      if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  unsigned getFPOpCost(Type *Ty) {
    // By default, FP instructions are assumed to be no more expensive than
    // integer operations, since they are typically implemented in hardware.
    // Target-specific TTI can override this.
    return TargetTransformInfo::TCC_Basic;
  }

  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) {
    const TargetLoweringBase *TLI = getTLI();
    switch (Opcode) {
    default: break;
    case Instruction::Trunc:
      if (TLI->isTruncateFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    case Instruction::ZExt:
      if (TLI->isZExtFree(OpTy, Ty))
        return TargetTransformInfo::TCC_Free;
      return TargetTransformInfo::TCC_Basic;
    }

    return BaseT::getOperationCost(Opcode, Ty, OpTy);
  }

  unsigned getInliningThresholdMultiplier() { return 1; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide
    // some motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization
    // Reference Manual, Intel Core models and later have a loop stream
    // detector (and associated uop queue) that can benefit from partial
    // unrolling. The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later)
    //    branches taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop
    // predictor and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches.
    //  - The loop must have less than 40 uops in all executed loop branches.

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative
    // when estimating the branch count. As a result, we'll ignore the branch
    // limits until someone finds a case where it matters in practice.
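    // Sketch of the policy below (subtarget numbers are hypothetical): if
    // the scheduling model reports a 28-entry loop micro-op buffer, MaxOps
    // becomes 28, so a call-free loop body of 7 instructions may be
    // partially unrolled about 4x before exceeding UP.PartialThreshold.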
    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
         ++I) {
      BasicBlock *BB = *I;

      for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
        if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
          ImmutableCallSite CS(&*J);
          if (const Function *F = CS.getCalledFunction()) {
            if (!static_cast<T *>(this)->isLoweredToCall(F))
              continue;
          }

          return;
        }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using the trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set the number of instructions optimized when a "back edge"
    // becomes a "fall through" to the default value of 2.
    UP.BEInsns = 2;
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }

  unsigned getRegisterBitWidth(bool Vector) const { return 32; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from
  /// vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
    assert(Ty->isVectorTy() && "Can only scalarize vectors");
    unsigned Cost = 0;

    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
      if (Insert)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied by VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (const Value *A : Args) {
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        Type *VecTy = nullptr;
        if (A->getType()->isVectorTy()) {
          VecTy = A->getType();
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 || VF == VecTy->getVectorNumElements()) &&
                 "Vector argument does not match VF");
        } else
          VecTy = VectorType::get(A->getType(), VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
    assert(VecTy->isVectorTy());

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(VecTy, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args,
                                               VecTy->getVectorNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(VecTy, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>()) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector costs we need to use
      // them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is
      // twice as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (Ty->isVectorTy()) {
      unsigned Num = Ty->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)
                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Ty, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp) {
    if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc ||
        Kind == TTI::SK_PermuteSingleSrc) {
      return getPermuteShuffleOverhead(Tp);
    }
    return 1;
  }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    // Check for NOOP conversions.
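    // For illustration (hypothetical target): if <4 x float> and <4 x i32>
    // each legalize in one step to the same 128-bit register type, a bitcast
    // between them is recognized here as a no-op and costed at 0.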
    if (SrcLT.first == DstLT.first &&
        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

      // Bitcasts between types that are legalized to the same type are free.
      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
        return 0;
    }

    if (Opcode == Instruction::Trunc &&
        TLI->isTruncateFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::ZExt &&
        TLI->isZExtFree(SrcLT.second, DstLT.second))
      return 0;

    if (Opcode == Instruction::AddrSpaceCast &&
        TLI->isNoopAddrSpaceCast(Src->getPointerAddressSpace(),
                                 Dst->getPointerAddressSpace()))
      return 0;

    // If this is a zext/sext of a load, return 0 if the corresponding
    // extending load exists on the target.
    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        I && isa<LoadInst>(I->getOperand(0))) {
      EVT ExtVT = EVT::getEVT(Dst);
      EVT LoadVT = EVT::getEVT(Src);
      unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
      if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
        return 0;
    }

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return 1;

    // Handle scalar conversions.
    if (!Src->isVectorTy() && !Dst->isVectorTy()) {
      // Scalar bitcasts are usually free.
      if (Opcode == Instruction::BitCast)
        return 0;

      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (Dst->isVectorTy() && Src->isVectorTy()) {
      // If the cast is between same-sized registers, then the check is
      // simple.
      if (SrcLT.first == DstLT.first &&
          SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return 1;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the
      // cost of casting the original vector twice. We also need to factor in
      // the cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      if ((TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
           TargetLowering::TypeSplitVector) ||
          (TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
           TargetLowering::TypeSplitVector)) {
        Type *SplitDst = VectorType::get(Dst->getVectorElementType(),
                                         Dst->getVectorNumElements() / 2);
        Type *SplitSrc = VectorType::get(Src->getVectorElementType(),
                                         Src->getVectorNumElements() / 2);
        T *TTI = static_cast<T *>(this);
        return TTI->getVectorSplitCost() +
               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
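      // For illustration (hypothetical target): an fptoui <8 x float> to
      // <8 x i8> that must expand is costed as 8 scalar conversions plus the
      // element insert/extract overhead computed below on the destination
      // type.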
      unsigned Num = Dst->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(Dst, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need
    // to assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast)
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true)
                                : 0) +
             (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false)
                                : 0);

    llvm_unreachable("Unhandled cast");
  }

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return static_cast<T *>(this)->getVectorInstrCost(
               Instruction::ExtractElement, VecTy, Index) +
           static_cast<T *>(this)->getCastInstrCost(Opcode, Dst,
                                                    VecTy->getElementType());
  }

  unsigned getCFInstrCost(unsigned Opcode) {
    // Branches are assumed to be predicted.
    return 0;
  }

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              const Instruction *I) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the op is scalarized.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (ValTy->isVectorTy()) {
      unsigned Num = ValTy->getVectorNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
          Opcode, ValTy->getScalarType(), CondTy, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace,
                           const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    unsigned Cost = LT.first;

    if (Src->isVectorTy() &&
        Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
      // This is a vector load that legalizes to a larger type than the
      // vector itself. Unless the corresponding extending load or truncating
      // store is legal, this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is
        // scalarized. We must account for the cost of building or
        // decomposing the vector.
        Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                      unsigned Factor,
                                      ArrayRef<unsigned> Indices,
                                      unsigned Alignment,
                                      unsigned AddressSpace) {
    VectorType *VT = dyn_cast<VectorType>(VecTy);
    assert(VT && "Expect a vector type for interleaved memory op");

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
        Opcode, VecTy, Alignment, AddressSpace);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize =
        static_cast<T *>(this)->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., an interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will
    // be used (those corresponding to elements [0:1] and [8:9] of the
    // unlegalized type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions
      // that will be used. Multiply before dividing so the fraction isn't
      // truncated to zero by integer arithmetic.
      Cost = Cost * UsedInsts.count() / NumLegalInsts;
    }

    // Then add the cost of the interleave operation.
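    // Worked example for the accounting below, assuming a hypothetical
    // per-element insert/extract cost of 1: a factor-2 interleaved load of
    // <8 x i32> with both members requested pays 2 * 4 = 8 extracts from the
    // wide vector plus 2 * 4 = 8 inserts into the sub-vectors, on top of the
    // memory-op cost.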
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub-vectors'
      // elements from the wide vector and inserting them into the
      // sub-vectors.
      //
      // E.g., an interleaved load of factor 2 (with one member of index 0):
      //       %vec = load <8 x i32>, <8 x i32>* %ptr
      //       %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extracting elements 0, 2, 4, and 6 from the
      // <8 x i32> vector and inserting them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from the loaded vector for each sub-vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += static_cast<T *>(this)->getVectorInstrCost(
              Instruction::ExtractElement, VT, Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is to extract all elements from the sub-vectors
      // and insert them into the wide vector.
      //
      // E.g., an interleaved store of factor 2:
      //       %interleaved.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //       store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost += static_cast<T *>(this)->getVectorInstrCost(
            Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    return Cost;
  }

  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                 ArrayRef<Value *> Args, FastMathFlags FMF,
                                 unsigned VF = 1) {
    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");

    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      SmallVector<Type *, 4> Types;
      for (Value *Op : Args) {
        Type *OpTy = Op->getType();
        assert(VF == 1 || !OpTy->isVectorTy());
        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
      }

      if (VF > 1 && !RetTy->isVoidTy())
        RetTy = VectorType::get(RetTy, VF);

      // Compute the scalarization overhead based on Args for a vector
      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
      // CostModel will pass a vector RetTy and VF is 1.
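      // For illustration (a hypothetical query): a vectorizer asking about
      // scalar llvm.fabs.f32 with VF = 4 gets the cost of the widened
      // <4 x float> intrinsic plus, only if that widened form must be
      // scalarized, the lane insert/extract overhead computed here.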
      unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
      if (RetVF > 1 || VF > 1) {
        ScalarizationCost = 0;
        if (!RetTy->isVoidTy())
          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
      }

      return static_cast<T *>(this)->
          getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
    }
    case Intrinsic::masked_scatter: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
      return
          static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
                                                         Args[0]->getType(),
                                                         Args[1], VarMask,
                                                         Alignment);
    }
    case Intrinsic::masked_gather: {
      assert(VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
      return
          static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
                                                         RetTy, Args[0],
                                                         VarMask, Alignment);
    }
    }
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
  unsigned getIntrinsicInstrCost(
      Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
      unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
    SmallVector<unsigned, 2> ISDs;
    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      unsigned ScalarizationCost = ScalarizationCostPassed;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (RetTy->isVectorTy()) {
        if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy()) {
          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, ScalarRetTy, ScalarTys, FMF);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
    // Look for intrinsics that can be lowered directly or turned into a
    // scalar intrinsic call.
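    // Each case below maps the intrinsic to the ISD node(s) its cost is
    // modeled on. E.g., llvm.sqrt.* maps to ISD::FSQRT; on a hypothetical
    // target where FSQRT is legal for the (unsplit) legalized type, the loop
    // after the switch costs it at 1.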
    case Intrinsic::sqrt:
      ISDs.push_back(ISD::FSQRT);
      break;
    case Intrinsic::sin:
      ISDs.push_back(ISD::FSIN);
      break;
    case Intrinsic::cos:
      ISDs.push_back(ISD::FCOS);
      break;
    case Intrinsic::exp:
      ISDs.push_back(ISD::FEXP);
      break;
    case Intrinsic::exp2:
      ISDs.push_back(ISD::FEXP2);
      break;
    case Intrinsic::log:
      ISDs.push_back(ISD::FLOG);
      break;
    case Intrinsic::log10:
      ISDs.push_back(ISD::FLOG10);
      break;
    case Intrinsic::log2:
      ISDs.push_back(ISD::FLOG2);
      break;
    case Intrinsic::fabs:
      ISDs.push_back(ISD::FABS);
      break;
    case Intrinsic::minnum:
      ISDs.push_back(ISD::FMINNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMINNAN);
      break;
    case Intrinsic::maxnum:
      ISDs.push_back(ISD::FMAXNUM);
      if (FMF.noNaNs())
        ISDs.push_back(ISD::FMAXNAN);
      break;
    case Intrinsic::copysign:
      ISDs.push_back(ISD::FCOPYSIGN);
      break;
    case Intrinsic::floor:
      ISDs.push_back(ISD::FFLOOR);
      break;
    case Intrinsic::ceil:
      ISDs.push_back(ISD::FCEIL);
      break;
    case Intrinsic::trunc:
      ISDs.push_back(ISD::FTRUNC);
      break;
    case Intrinsic::nearbyint:
      ISDs.push_back(ISD::FNEARBYINT);
      break;
    case Intrinsic::rint:
      ISDs.push_back(ISD::FRINT);
      break;
    case Intrinsic::round:
      ISDs.push_back(ISD::FROUND);
      break;
    case Intrinsic::pow:
      ISDs.push_back(ISD::FPOW);
      break;
    case Intrinsic::fma:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::fmuladd:
      ISDs.push_back(ISD::FMA);
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      return 0;
    case Intrinsic::masked_store:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Store, Tys[0], 0, 0);
    case Intrinsic::masked_load:
      return static_cast<T *>(this)
          ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    // FIXME: ctlz, cttz, ...
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<unsigned, 2> LegalCost;
    SmallVector<unsigned, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is
        // some overhead to this.
        // TODO: Once we have extract/insert subvector costs we need to use
        // them.
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered then assume
        // that the code is twice as expensive.
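        // (For illustration: a hypothetical legalization splitting the type
        // into two registers gives LT.first == 2 and a custom cost of 4.)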
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI =
        std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

    // If we can't lower fmuladd into an FMA, estimate the cost as a floating
    // point mul followed by an add.
    if (IID == Intrinsic::fmuladd)
      return static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
             static_cast<T *>(this)
                 ->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);

    // Else, assume that we need to scalarize this intrinsic. For math
    // builtins this will emit a costly libcall, adding call overhead and
    // spills. Make it very expensive.
    if (RetTy->isVectorTy()) {
      unsigned ScalarizationCost =
          ((ScalarizationCostPassed != std::numeric_limits<unsigned>::max())
               ? ScalarizationCostPassed
               : getScalarizationOverhead(RetTy, true, false));
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      unsigned ScalarCost = static_cast<T *>(this)->getIntrinsicInstrCost(
          IID, RetTy->getScalarType(), ScalarTys, FMF);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (Tys[i]->isVectorTy()) {
          if (ScalarizationCostPassed == std::numeric_limits<unsigned>::max())
            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
          ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
        }
      }

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call, make it expensive.
    return SingleCallCost;
  }

  /// \brief Compute a cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr, in which case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate a call of a vector
  /// counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
  /// \param Tys Argument types.
  /// \returns The cost of the call instruction.
  unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
    return LT.first;
  }

  unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                     const SCEV *) {
    return 0;
  }

  /// Try to calculate arithmetic and shuffle op costs for reduction
  /// operations. We're assuming that reduction operations are performed in
  /// the following way:
  /// 1. Non-pairwise reduction
  /// %val1 = shufflevector <n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful, the second n/2 elements are undefined and can
  /// be dropped. All other operations are actually working with the vector
  /// of length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector <n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/4 elements              3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
  /// length n/2, the resulting vector has length n/4, etc.
  /// 2. Pairwise reduction:
  /// Everything is the same except for an additional shuffle operation which
  /// is used to produce operands for the pairwise kind of reductions.
  /// %val1 = shufflevector <n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                   n/2 elements               n/2 elements
  /// %val2 = shufflevector <n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
  ///            \-------------v----------/  \----------v------------/
  ///                   n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val1, <n x t> %val2
  /// Again, the operation is performed on an <n x t> vector, but the
  /// resulting vector %red1 is an <n/2 x t> vector.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
  unsigned getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                      bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, Ty);
      ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
      Ty = VectorType::get(ScalarTy, NumVecElts);
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of the
    // vector operations performed on the current platform. That's why
    // several final reduction operations are performed on vectors with the
    // same architecture-dependent length.
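    // For illustration (hypothetical 4-wide target): reducing <16 x float>
    // gives NumReduxLevels = 4; the loop above costs the two halving steps
    // on longer-than-legal vectors, and the remaining 4 - 2 = 2 steps are
    // costed below on the legal 4-element type.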
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                               NumVecElts, Ty);
    ArithCost += (NumReduxLevels - LongVectorCount) *
                 ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
  }

  /// Try to calculate op costs for min/max reduction operations.
  /// \param CondTy Conditional type for the Select instruction.
  unsigned getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwise,
                                  bool) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    Type *ScalarCondTy = CondTy->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned CmpOpcode;
    if (Ty->isFPOrFPVectorTy()) {
      CmpOpcode = Instruction::FCmp;
    } else {
      assert(Ty->isIntOrIntVectorTy() &&
             "expecting floating point or integer type for min/max reduction");
      CmpOpcode = Instruction::ICmp;
    }
    unsigned MinMaxCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
    std::pair<unsigned, MVT> LT =
        ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      // Assume the pairwise shuffles add a cost.
      ShuffleCost += (IsPairwise + 1) *
                     ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                                 NumVecElts, Ty);
      MinMaxCost +=
          ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
          ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                          nullptr);
      Ty = VectorType::get(ScalarTy, NumVecElts);
      CondTy = VectorType::get(ScalarCondTy, NumVecElts);
      ++LongVectorCount;
    }
    // The minimal length of the vector is limited by the real length of the
    // vector operations performed on the current platform. That's why
    // several final reduction operations are performed on vectors with the
    // same architecture-dependent length.
    ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                               NumVecElts, Ty);
    MinMaxCost +=
        (NumReduxLevels - LongVectorCount) *
        (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
         ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                         nullptr));
    // Need 3 extractelement instructions for scalarization + an additional
    // scalar select instruction.
    return ShuffleCost + MinMaxCost +
           3 * getScalarizationOverhead(Ty, /*Insert=*/false,
                                        /*Extract=*/true) +
           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
                                           ScalarCondTy, nullptr);
  }

  unsigned getVectorSplitCost() { return 1; }

  /// @}
};

/// \brief Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H