//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// PPC target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "PPC.h"
#include "PPCTargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

// Escape hatch: when set, all three getIntImmCost overloads below defer to
// the target-independent TargetTransformInfo implementation instead of the
// PPC-specific immediate-cost model.
static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializePPCTTIPass(PassRegistry &);
}

namespace {

/// PPCTTI - PPC-specific implementation of the TargetTransformInfo analysis
/// group, registered as an ImmutablePass. It answers cost-model queries using
/// PPCSubtarget feature information (Altivec, VSX, POPCNTD, CPU directive)
/// and falls back to the default TargetTransformInfo for everything else.
class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
  const PPCSubtarget *ST;        // Subtarget feature info (never null after TM ctor).
  const PPCTargetLowering *TLI;  // Lowering info, used for type legalization costs.

public:
  // Default constructor exists only to satisfy pass machinery; the pass must
  // be created through the TargetMachine-taking constructor below.
  PPCTTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  PPCTTI(const PPCTargetMachine *TM)
      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializePPCTTIPass(*PassRegistry::getPassRegistry());
  }

  // Register this implementation on the TTI analysis-group stack so queries
  // reach it before the default implementation.
  virtual void initializePass() override {
    pushTTIStack(this);
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;

  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;

  virtual PopcntSupportKind
  getPopcntSupport(unsigned TyWidth) const override;
  virtual void getUnrollingPreferences(
    Loop *L, UnrollingPreferences &UP) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  virtual unsigned getNumberOfRegisters(bool Vector) const override;
  virtual unsigned getRegisterBitWidth(bool Vector) const override;
  virtual unsigned getMaximumUnrollFactor() const override;
  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                          OperandValueKind,
                                          OperandValueKind) const override;
  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                                  int Index, Type *SubTp) const override;
  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                                    Type *Src) const override;
  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                      Type *CondTy) const override;
  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index) const override;
  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
                                   unsigned Alignment,
                                   unsigned AddressSpace) const override;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
                   "PPC Target Transform Info", true, true, false)
char PPCTTI::ID = 0;

ImmutablePass *
llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
  return new PPCTTI(TM);
}


//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

// Population count is fast hardware only when the subtarget has POPCNTD and
// the type fits in a 64-bit register; otherwise report a software fallback.
PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return PSK_FastHardware;
  return PSK_Software;
}

// Cost of materializing an arbitrary integer immediate, measured in
// TCC_Basic units: free for zero, one instruction for a sign-extended
// 16-bit value, one for a "lis"-style value (low 16 bits clear), two for
// any other 32-bit value, and four otherwise (full multi-instruction
// materialization sequence).
unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;  // Unsized type: report "infinite" cost.

  if (Imm == 0)
    return TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TCC_Basic;

      return 2 * TCC_Basic;
    }
  }

  return 4 * TCC_Basic;
}

// Cost of an immediate operand to an intrinsic. The signed/unsigned
// add/sub-with-overflow intrinsics accept a 16-bit signed immediate as
// their second operand for free; everything else defers to the generic
// immediate-materialization cost above.
unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                               const APInt &Imm, Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default: return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TCC_Free;
    break;
  }
  return PPCTTI::getIntImmCost(Imm, Ty);
}

// Cost of an immediate operand of instruction Opcode at operand index Idx.
// The switch classifies which immediate forms the instruction can absorb:
//   ImmIdx       - operand index at which an immediate form exists at all
//   ZeroFree     - a zero immediate is free (e.g. record-form compares)
//   UnsignedFree - a 16-bit *unsigned* immediate is free
//   ShiftedFree  - an immediate with the low 16 bits clear is free
//                  (presumably the addis/oris/xoris "shifted" forms - the
//                  encoding detail is not visible here)
//   RunFree      - a contiguous-bit (shifted) mask or its complement is
//                  free, per the rotate-and-mask instructions
// Anything not absorbed costs the generic materialization price.
unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) const {
  if (DisablePPCConstHoist)
    return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default: return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TCC_Free;

    if (RunFree) {
      // A run of set bits (or the complement of one) fits the 32-bit
      // rotate-and-mask forms; 64-bit masks additionally require PPC64.
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TCC_Free;
  }

  return PPCTTI::getIntImmCost(Imm, Ty);
}

void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;
  }
}

// Register-file sizes: no vector registers without Altivec; with VSX there
// are 64 (vector) registers available, otherwise 32.
unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
  if (Vector && !ST->hasAltivec())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;

}

unsigned PPCTTI::getMaximumUnrollFactor() const {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// No PPC-specific arithmetic costs yet; validate the opcode and defer.
unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                        OperandValueKind Op1Info,
                                        OperandValueKind Op2Info) const {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
                                                     Op2Info);
}

unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}

unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                    unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty +
      TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);

  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}

unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  unsigned Cost =
    TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // VSX loads/stores support unaligned access.
  if (ST->hasVSX()) {
    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
      return Cost;
  }

  // 128-bit Altivec vector *loads* of a full-width (non-widened) type take
  // the unaligned path below for free; stores do not.
  bool UnalignedAltivec =
    Src->isVectorTy() &&
    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
    LT.second.getSizeInBits() == 128 &&
    Opcode == Instruction::Load;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
    Cost += LT.first*(SrcBytes/Alignment-1);

    // For a vector type, there is also scalarization overhead (only for
    // stores, loads are expanded using the vector-load + permutation sequence,
    // which is much less expensive).
    if (Src->isVectorTy() && Opcode == Instruction::Store)
      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
  }

  return Cost;
}