//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass which is only enabled
// for BG/Q by default.
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}
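// Illustrative reading of getPopcntSupport above (a sketch, not additional
// logic): for a 32- or 64-bit integer type, a subtarget reporting a usable
// popcntd instruction yields PSK_SlowHardware or PSK_FastHardware depending
// on whether the instruction is flagged slow; wider types, or subtargets
// without popcntd, fall back to PSK_Software.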

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}
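// Rough worked examples for getIntImmCost above (an illustrative sketch of
// the intended mapping to materialization sequences, not exhaustive):
//   0x7FFF             -> TCC_Basic     (fits a signed 16-bit immediate, e.g. li)
//   0x12340000         -> TCC_Basic     (low halfword clear, e.g. lis)
//   0x12345678         -> 2 * TCC_Basic (e.g. lis followed by ori)
//   not representable
//   in 32 bits         -> 4 * TCC_Basic (conservative multi-instruction estimate)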

int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}
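// Illustrative reading of the intrinsic cases above (a sketch): a 16-bit
// immediate as the second operand of the add/sub overflow intrinsics is
// assumed to fold into the instruction, and stackmap/patchpoint operands
// (the fixed leading operands, or any live value fitting in 64 bits) are
// assumed to be encoded directly, so all of these are reported as TCC_Free.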

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}
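// Rough worked examples for the instruction-level cases above (an illustrative
// sketch; the actual instruction selection is up to the backend):
//   add %x, 100        -> immediate is free (signed 16-bit, e.g. addi)
//   and %x, 0x00FF0000 -> free via RunFree: the value (or its complement) is a
//                         contiguous shifted mask, encodable by the
//                         rotate-and-mask instructions (e.g. rlwinm)
//   icmp ult %x, 1000  -> free via UnsignedFree (unsigned 16-bit compare)
//   or  %x, 0x12340000 -> free via ShiftedFree (low halfword clear, e.g. oris)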

void PPCTTIImpl::getUnrollingPreferences(Loop *L,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. Aggressive interleaving makes such
  // combining much more likely (compared to only using concatenation
  // unrolling).
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getCacheLineSize() {
  // This is currently only used for the data prefetch pass which is only
  // enabled for BG/Q by default.
  return CacheLineSize;
}

unsigned PPCTTIImpl::getPrefetchDistance() {
  // This seems like a reasonable default for the BG/Q (this pass is enabled, by
  // default, only on the BG/Q).
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: Use the same value for P9 as for the previous generation until
  // POWER9 scheduling is ready.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fall back to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first;
}
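// Illustrative example for getShuffleCost above (a sketch): a v8i32 shuffle on
// an Altivec/VSX subtarget legalizes to two 128-bit registers (LT.first == 2),
// so the reported cost is 2 -- roughly one permute-style instruction per
// result register.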

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay.  This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark.  It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store.  Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}
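// Illustrative example for getVectorInstrCost above (a sketch): on an
// Altivec-only subtarget, an insertelement is charged the base cost plus
// LHSPenalty == 2 + 7 == 9 to model the store/reload and load-hit-store
// stall, while an extractelement is charged the base cost plus 2. With VSX,
// accessing a double at index 0 is free because it already sits in the
// scalar position.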

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first * (SrcBytes / Alignment - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}
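// Rough worked example for the misaligned fall-through path above (a sketch):
// storing a v4i32 (16 bytes) with 4-byte alignment on an Altivec-only
// subtarget adds LT.first * (16/4 - 1) == 3 to the base cost for the extra
// scalar stores, plus one extractelement cost for each of the four elements
// via getVectorInstrCost.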

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load/store operation itself.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}
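// Illustrative example for getInterleavedMemoryOpCost above (a sketch): a
// stride-2 interleaved load group (Factor == 2) whose legalized type spans two
// registers (LT.first == 2) adds Factor * (LT.first - 1) == 2 shuffles on top
// of the memory-op cost computed by getMemoryOpCost.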