//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// AArch64 target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeAArch64TTIPass(PassRegistry &);
}

namespace {

class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
  const AArch64TargetMachine *TM;
  const AArch64Subtarget *ST;
  const AArch64TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  AArch64TTI(const AArch64TargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override { pushTTIStack(this); }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo *)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(int64_t Val) const;
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

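  // Background: AArch64 provides 31 general-purpose 64-bit registers (x0-x30;
  // register number 31 encodes SP or XZR depending on context) and, when NEON
  // is available, 32 128-bit vector registers (v0-v31). Those are the
  // constants returned below.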
  unsigned getNumberOfRegisters(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 32;
      return 0;
    }
    return 31;
  }

  unsigned getRegisterBitWidth(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 128;
      return 0;
    }
    return 64;
  }

  unsigned getMaximumUnrollFactor() const override { return 2; }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
      override;

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
      override;

  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                  OperandValueKind Opd1Info = OK_AnyValue,
                                  OperandValueKind Opd2Info = OK_AnyValue) const
      override;

  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
      override;

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;
  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
                   "AArch64 Target Transform Info", true, true, false)
char AArch64TTI::ID = 0;

ImmutablePass *
llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
  return new AArch64TTI(TM);
}
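
// A minimal usage sketch (illustrative, not part of this file): the target
// machine is expected to register this analysis when building its pass
// pipeline, along the lines of:
//
//   void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
//     PM.add(createAArch64TargetTransformInfoPass(this));
//   }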

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}
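
// Worked example (illustrative): Val = 0x12345678 has 35 leading zeros, so the
// formula yields (64 - 35 + 15) / 16 == 2, matching the two-instruction
// MOVZ/MOVK sequence:
//
//   movz x0, #0x5678
//   movk x0, #0x1234, lsl #16
//
// Negative values are inverted first because they are materialized with MOVN
// plus MOVKs. Note the count is an estimate: all-zero 16-bit chunks below the
// top set bit are counted even though they need no MOVK.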

/// \brief Calculate the cost of materializing the given constant.
unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}
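
// Worked example (illustrative): a 128-bit constant whose 64-bit chunks are
// 0x1 and 0x12345678 costs 0 + 2 == 2, because 0x1 is encodable as a logical
// immediate while 0x12345678 needs a MOVZ/MOVK pair. The std::max clamp
// matters when every chunk is free (e.g. the constant 0), since at least one
// instruction is still needed to produce the value in a register.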

unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
                                 const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TCC_Basic)
      ? static_cast<unsigned>(TCC_Free) : Cost;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}
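
// The intent (illustrative): this hook tells constant hoisting which operand
// positions commonly fold into the instruction encoding. For instance,
//
//   add w0, w1, #42
//
// encodes 42 directly, so reporting TCC_Free there keeps the constant from
// being hoisted into a register; only immediates that are genuinely expensive
// to materialize report their real cost.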

unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                 const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  switch (IID) {
  default:
    return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      unsigned NumConstants = (BitSize + 63) / 64;
      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TCC_Basic)
        ? static_cast<unsigned>(TCC_Free) : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

AArch64TTI::PopcntSupportKind
AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
  return PSK_Software;
}
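
// Background (a sketch of the expected lowering, not verified here): AArch64
// has no scalar popcount instruction, so 32/64-bit CTPOP is lowered through
// NEON, roughly an fmov into a vector register, a CNT on the v8i8 view, and a
// horizontal add back to a scalar. That short sequence is still fast enough
// to report as PSK_FastHardware.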

unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                    Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },

    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },

    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
  };

  int Idx = ConvertCostTableLookup<MVT>(
      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
      SrcTy.getSimpleVT());
  if (Idx != -1)
    return ConversionTbl[Idx].Cost;

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}
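
// Reading the table (illustrative): a signed v4i8 -> v4f32 conversion costs 4
// because the i8 lanes must be widened in stages up to i32 before an scvtf
// can convert them, whereas v4i32 -> v4f32 is a single scvtf and costs 1.
// Any (opcode, dst, src) triple not listed falls through to the generic
// TargetTransformInfo estimate.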

unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
    if (Index == 0)
      return 0;
  }

  // All other insert/extracts cost this much.
  return 2;
}
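
// Worked example (illustrative): extracting element 5 of a v8i32 on NEON.
// v8i32 legalizes to two v4i32 registers, so Width == 4 and the index
// normalizes to 5 % 4 == 1, giving a cost of 2. Element 4 normalizes to 0,
// the low lane of the second register, and is therefore reported as free.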

unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                          OperandValueKind Opd1Info,
                                          OperandValueKind Opd2Info) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
                                                       Opd2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return 1 * LT.first;
  }
}
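
// Worked example (illustrative): an add on v4i32 is a single NEON add, so
// LT.first == 1 and the cost is 1. An add on v8i32 must be split into two
// v4i32 halves during legalization, so LT.first == 2 and the cost is 2.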

unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}
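
// Illustration (a sketch, not taken from this file): scalar code can usually
// fold the address arithmetic into the load itself, e.g.
//
//   ldr w0, [x1, x2, lsl #2]   // base + scaled index in one instruction
//
// whereas a vectorized access with non-consecutive addresses must compute
// each lane's address with separate instructions; the cost of 10 models that
// extra micro-op overhead.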

unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                      Type *CondTy) const {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower vector selects well when they are wider than the register
  // width.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    unsigned AmortizationCost = 20;
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx =
          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
                                 SelValTy.getSimpleVT());
      if (Idx != -1)
        return VectorSelectTbl[Idx].Cost;
    }
  }
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
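
// Worked example (illustrative): a select of two v8i32 values under a v8i1
// condition gets scalarized, so the table charges 8 * 20 == 160. The large
// amortization factor effectively tells the vectorizer to avoid such selects
// unless plenty of other vector work pays for them.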

unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                   unsigned Alignment,
                                   unsigned AddressSpace) const {
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);

  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isIntegerTy(64)) {
    // Unaligned stores are extremely inefficient. We don't split unaligned
    // v2i64 stores because of the negative impact that has shown in practice
    // on inlined memcpy code.
    // We make v2i64 stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    unsigned AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }
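
  // Worked arithmetic (illustrative): for a single unaligned v2i64 store,
  // LT.first == 1, so the reported cost is 1 * 2 * 6 == 12, i.e. roughly
  // twelve "simple" instructions' worth of discouragement.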

  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
      Src->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Src->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }
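
  // Worked arithmetic (illustrative): a v4i8 access has NumVecElts == 4, so
  // the cost is (4 * 2) * 4 * 2 == 64, again steering the vectorizer away
  // unless the scalarization can be amortized across other vector work.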

  return LT.first;
}
    501