//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
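  // POPCNTD (introduced with Power ISA 2.06, i.e. POWER7 and later) provides
  // a single-instruction population count for word and doubleword operands,
  // so report fast hardware support for widths up to 64 bits.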
  if (ST->hasPOPCNTD() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

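  // Wider constants are assumed to need a full multi-instruction
  // materialization sequence (on 64-bit targets roughly lis, ori, rldicr,
  // oris, ori), approximated here as four basic ops.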
  return 4 * TTI::TCC_Basic;
}

int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
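  // Note: for stackmaps and patchpoints the leading fixed operands (ID,
  // shadow/patch sizes, etc.) are encoded directly, and any other constant
  // that fits in 64 bits is recorded in the stackmap record rather than
  // materialized in registers, so those immediates are treated as free.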
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                              Type *Ty) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    // Fallthrough...
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    // Fallthrough...
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Fallthrough... (zero comparisons can use record-form instructions)
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

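    // The rotate-and-mask instructions (rlwinm, rldicl, and friends) can
    // encode a contiguous run of ones, or the complement of one, directly in
    // the instruction, so such masks need no separate materialization. For
    // example, 0x0FFFFFF0 is a shifted mask and would be free here.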
    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

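    // Unsigned 16-bit comparison immediates fit directly in the unsigned
    // compare-immediate forms (cmplwi/cmpldi).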
    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

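    // Immediates with the low 16 bits clear fit the shifted-immediate forms
    // (e.g. addis, oris, xoris).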
    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty);
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L,
                                         TTI::UnrollingPreferences &UP) {
  if (ST->getDarwinDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, UP);
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
  // on combining the loads generated for consecutive accesses, and failure to
  // do so is particularly expensive. Aggressive interleaving makes such
  // combining much more likely (compared to using concatenation unrolling
  // alone).
  if (ST->getDarwinDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
    return 0;
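  // With VSX, the 32 FPRs and 32 Altivec VRs are unified into a single
  // 64-entry VSX register file; otherwise each register class provides 32.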
  return ST->hasVSX() ? 64 : 32;
}

unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
  if (Vector) {
    if (ST->hasQPX()) return 256;
    if (ST->hasAltivec()) return 128;
    return 0;
  }

  if (ST->isPPC64())
    return 64;
  return 32;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getDarwinDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  if (Directive == PPC::DIR_PWR7 ||
      Directive == PPC::DIR_PWR8)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

int PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
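  // For example, a shuffle of <8 x i32> on a 128-bit Altivec target legalizes
  // to two v4i32 registers, so LT.first (and hence the reported cost) would
  // be 2.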
  return LT.first;
}

int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  return BaseT::getCastInstrCost(Opcode, Dst, Src);
}

int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

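  // With VSX, a scalar double already lives in element 0 of the underlying
  // register, so inserting into or extracting from lane 0 needs no
  // instruction; the same holds for QPX floating-point scalars below.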
  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
    // Floating point scalars are already located in index #0.
    if (Index == 0)
      return 0;

    return BaseT::getVectorInstrCost(Opcode, Val, Index);
  }

  // Estimated cost of a load-hit-store delay.  This was obtained
  // experimentally as the minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark.  It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store.  Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                unsigned AddressSpace) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
    return Cost;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
  bool IsQPXType = ST->hasQPX() &&
                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we should do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
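  // (Concretely, the permutation-based sequence is the classic lvsl + lvx +
  // vperm idiom, in which the lvsl and all but the final lvx can typically be
  // hoisted or shared across consecutive accesses.)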
  if (Opcode == Instruction::Load &&
      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
      Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  Cost += LT.first * (SrcBytes / Alignment - 1);
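  // For example, a 16-byte vector store with 4-byte alignment is assumed to
  // be split into SrcBytes / Alignment = 4 naturally aligned pieces, adding
  // LT.first * 3 to the cost here.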

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}

int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                           unsigned Factor,
                                           ArrayRef<unsigned> Indices,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);

  // First, the cost of the load/store operation.
  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);

  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);
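  // For example, with Factor == 2 and a result type that legalizes to two
  // registers (LT.first == 2), this adds 2 * (2 - 1) = 2 permute instructions.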

  return Cost;
}