Home | History | Annotate | Download | only in ARM
      1 //===-- ARMTargetTransformInfo.cpp - ARM specific TTI pass ----------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 /// \file
     10 /// This file implements a TargetTransformInfo analysis pass specific to the
     11 /// ARM target machine. It uses the target's detailed information to provide
     12 /// more precise answers to certain TTI queries, while letting the target
     13 /// independent and default TTI implementations handle the rest.
     14 ///
     15 //===----------------------------------------------------------------------===//
     16 
     17 #include "ARM.h"
     18 #include "ARMTargetMachine.h"
     19 #include "llvm/Analysis/TargetTransformInfo.h"
     20 #include "llvm/Support/Debug.h"
     21 #include "llvm/Target/CostTable.h"
     22 #include "llvm/Target/TargetLowering.h"
     23 using namespace llvm;
     24 
     25 #define DEBUG_TYPE "armtti"
     26 
     27 // Declare the pass initialization routine locally as target-specific passes
     28 // don't have a target-wide initialization entry point, and so we rely on the
     29 // pass constructor initialization.
     30 namespace llvm {
     31 void initializeARMTTIPass(PassRegistry &);
     32 }
     33 
namespace {

/// ARM-specific implementation of TargetTransformInfo.
///
/// Implemented as an ImmutablePass so it can be pushed onto the TTI analysis
/// group stack; any query not overridden here falls through to the default
/// TargetTransformInfo implementation below it on the stack.
class ARMTTI final : public ImmutablePass, public TargetTransformInfo {
  const ARMBaseTargetMachine *TM;
  const ARMSubtarget *ST;
  const ARMTargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  // Default constructor exists only to satisfy the pass machinery; the pass
  // must always be created via the TargetMachine-taking constructor below.
  ARMTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  ARMTTI(const ARMBaseTargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeARMTTIPass(*PassRegistry::getPassRegistry());
  }

  // Push this implementation onto the TTI analysis-group stack when the pass
  // is initialized.
  void initializePass() override {
    pushTTIStack(this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  using TargetTransformInfo::getIntImmCost;
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;

  /// @}


  /// \name Vector TTI Implementations
  /// @{

  // Register counts used by the vectorizers: 16 Q registers with NEON
  // (0 vector registers otherwise), and 8 (Thumb1) or 13 allocatable GPRs.
  unsigned getNumberOfRegisters(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 16;
      return 0;
    }

    if (ST->isThumb1Only())
      return 8;
    return 13;
  }

  // 128-bit Q registers with NEON; scalar GPRs are 32 bits.
  unsigned getRegisterBitWidth(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 128;
      return 0;
    }

    return 32;
  }

  unsigned getMaximumUnrollFactor() const override {
    // These are out of order CPUs:
    if (ST->isCortexA15() || ST->isSwift())
      return 2;
    return 1;
  }

  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                          int Index, Type *SubTp) const override;

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;

  unsigned getAddressComputationCost(Type *Val,
                                     bool IsComplex) const override;

  unsigned
  getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                         OperandValueKind Op1Info = OK_AnyValue,
                         OperandValueKind Op2Info = OK_AnyValue) const override;

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;
  /// @}
};

} // end anonymous namespace
    140 
// Register ARMTTI as a member of the TargetTransformInfo analysis group.
INITIALIZE_AG_PASS(ARMTTI, TargetTransformInfo, "armtti",
                   "ARM Target Transform Info", true, true, false)
char ARMTTI::ID = 0;

/// Factory used by the ARM target machine to add this pass to the pipeline.
ImmutablePass *
llvm::createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM) {
  return new ARMTTI(TM);
}
    149 
    150 
    151 unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
    152   assert(Ty->isIntegerTy());
    153 
    154   unsigned Bits = Ty->getPrimitiveSizeInBits();
    155   if (Bits == 0 || Bits > 32)
    156     return 4;
    157 
    158   int32_t SImmVal = Imm.getSExtValue();
    159   uint32_t ZImmVal = Imm.getZExtValue();
    160   if (!ST->isThumb()) {
    161     if ((SImmVal >= 0 && SImmVal < 65536) ||
    162         (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
    163         (ARM_AM::getSOImmVal(~ZImmVal) != -1))
    164       return 1;
    165     return ST->hasV6T2Ops() ? 2 : 3;
    166   }
    167   if (ST->isThumb2()) {
    168     if ((SImmVal >= 0 && SImmVal < 65536) ||
    169         (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
    170         (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
    171       return 1;
    172     return ST->hasV6T2Ops() ? 2 : 3;
    173   }
    174   // Thumb1.
    175   if (SImmVal >= 0 && SImmVal < 256)
    176     return 1;
    177   if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
    178     return 2;
    179   // Load from constantpool.
    180   return 3;
    181 }
    182 
/// Return the expected cost of a cast instruction (Opcode) converting from
/// \p Src to \p Dst, using NEON/ARM-specific cost tables where a specific
/// instruction sequence is known, and falling back to the generic TTI cost.
unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                  Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Single to/from double precision conversions.
  static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = {
    // Vector fptrunc/fpext conversions.
    { ISD::FP_ROUND,   MVT::v2f64, 2 },
    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
  };

  // The fp round/extend table is keyed on the *legalized source* type, so it
  // is consulted before the simple-type check below.
  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
                                          ISD == ISD::FP_EXTEND)) {
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
    int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second);
    if (Idx != -1)
      return LT.first * NEONFltDblTbl[Idx].Cost;
  }

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  // The remaining tables are keyed on simple (MVT) types; bail out to the
  // generic implementation for anything that has no simple representation.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  // Some arithmetic, load and store operations have specific instructions
  // to cast up/down their types automatically at no extra cost.
  // TODO: Get these tables to know at least what the related operations are.
  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  NEONVectorConversionTbl[] = {
    // Cost 0 entries model extensions/truncates that fold into neighboring
    // operations.
    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },

    // The number of vmovl instructions for the extension.
    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

    // Operations that we legalize using splitting.
    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },

    // Vector float <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },

    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },

    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },

    // Vector double <-> i32 conversions.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
    // NOTE(review): the two entries below duplicate the v2f64<-v2i32 entries
    // above with the same cost; lookup returns the first match, so they are
    // dead but harmless.
    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },

    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
  };

  if (SrcTy.isVector() && ST->hasNEON()) {
    int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return NEONVectorConversionTbl[Idx].Cost;
  }

  // Scalar float to integer conversions.
  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  NEONFloatConversionTbl[] = {
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
    // i64 results need a library-call-like sequence; much more expensive.
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
  };
  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
    int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
        return NEONFloatConversionTbl[Idx].Cost;
  }

  // Scalar integer to float conversions.
  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  NEONIntegerConversionTbl[] = {
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
    // i64 sources go through an expensive expansion.
    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
  };

  if (SrcTy.isInteger() && ST->hasNEON()) {
    int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return NEONIntegerConversionTbl[Idx].Cost;
  }

  // Scalar integer conversion costs.
  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  ARMIntegerConversionTbl[] = {
    // i16 -> i64 requires two dependent operations.
    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },

    // Truncates on i64 are assumed to be free.
    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
  };

  if (SrcTy.isInteger()) {
    int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return ARMIntegerConversionTbl[Idx].Cost;
  }

  // No table matched; defer to the generic implementation.
  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}
    381 
    382 unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
    383                                     unsigned Index) const {
    384   // Penalize inserting into an D-subregister. We end up with a three times
    385   // lower estimated throughput on swift.
    386   if (ST->isSwift() &&
    387       Opcode == Instruction::InsertElement &&
    388       ValTy->isVectorTy() &&
    389       ValTy->getScalarSizeInBits() <= 32)
    390     return 3;
    391 
    392   return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
    393 }
    394 
/// Cost of a compare or select with value type \p ValTy and (for selects)
/// condition type \p CondTy.
unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // On NEON a vector select gets lowered to vbsl.
  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // Lowering of some vector selects is currently far from perfect.
    // Costs were measured empirically for condition/value type pairs that
    // legalize poorly; the table is keyed (condition type, value type).
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
    NEONVectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
    };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
                                       SelCondTy.getSimpleVT(),
                                       SelValTy.getSimpleVT());
      if (Idx != -1)
        return NEONVectorSelectTbl[Idx].Cost;
    }

    // Otherwise charge one vbsl per legalized piece.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
    return LT.first;
  }

  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
    428 
    429 unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
    430   // Address computations in vectorized code with non-consecutive addresses will
    431   // likely result in more instructions compared to scalar code where the
    432   // computation can more often be merged into the index mode. The resulting
    433   // extra micro-ops can significantly decrease throughput.
    434   unsigned NumVectorInstToHideOverhead = 10;
    435 
    436   if (Ty->isVectorTy() && IsComplex)
    437     return NumVectorInstToHideOverhead;
    438 
    439   // In many cases the address computation is not merged into the instruction
    440   // addressing mode.
    441   return 1;
    442 }
    443 
/// Cost of a shuffle of kind \p Kind on vector type \p Tp. Only reverse and
/// alternate shuffles have ARM-specific costs; everything else falls back to
/// the generic implementation.
unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  // We only handle costs of reverse and alternate shuffles for now.
  if (Kind != SK_Reverse && Kind != SK_Alternate)
    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

  if (Kind == SK_Reverse) {
    static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
        // Reverse shuffle cost one instruction if we are shuffling within a
        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},

        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);

    int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
    if (Idx == -1)
      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

    // Scale by the number of legalized pieces.
    return LT.first * NEONShuffleTbl[Idx].Cost;
  }
  if (Kind == SK_Alternate) {
    static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
        // Alt shuffle cost table for ARM. Cost is the number of instructions
        // required to create the shuffled vector.

        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},

        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},

        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},

        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
    int Idx =
        CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
    if (Idx == -1)
      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
    return LT.first * NEONAltShuffleTbl[Idx].Cost;
  }
  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}
    499 
/// Cost of an arithmetic instruction of the given opcode on type \p Ty.
/// Op1Info/Op2Info describe what is known about the operands (e.g. uniform
/// constant), which affects how the operation is lowered.
unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                        OperandValueKind Op1Info,
                                        OperandValueKind Op2Info) const {

  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  const unsigned FunctionCallDivCost = 20;
  const unsigned ReciprocalDivCost = 10;
  static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = {
    // Division.
    // These costs are somewhat random. Choose a cost of 20 to indicate that
    // vectorizing division (added function call) is going to be very expensive.
    // The multiplier is the number of scalar calls needed after legalization
    // (one per element for element types narrower than i64).
    // Double registers types.
    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
    // v4i16/v8i8 division can use a reciprocal-estimate sequence instead of
    // per-element calls; remainder still needs the calls.
    { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
    { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
    { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
    { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
    { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
    // Quad register types.
    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
    // Multiplication.
    // NOTE(review): no multiplication entries yet; multiplies currently fall
    // through to the generic cost below.
  };

  int Idx = -1;

  if (ST->hasNEON())
    Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second);

  if (Idx != -1)
    return LT.first * CostTbl[Idx].Cost;

  unsigned Cost =
      TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);

  // This is somewhat of a hack. The problem that we are facing is that SROA
  // creates a sequence of shift, and, or instructions to construct values.
  // These sequences are recognized by the ISel and have zero-cost. Not so for
  // the vectorized code. Because we have support for v2i64 but not i64 those
  // sequences look particularly beneficial to vectorize.
  // To work around this we increase the cost of v2i64 operations to make them
  // seem less beneficial.
  if (LT.second == MVT::v2i64 &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
    Cost += 4;

  return Cost;
}
    574 
    575 unsigned ARMTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
    576                                  unsigned AddressSpace) const {
    577   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
    578 
    579   if (Src->isVectorTy() && Alignment != 16 &&
    580       Src->getVectorElementType()->isDoubleTy()) {
    581     // Unaligned loads/stores are extremely inefficient.
    582     // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
    583     return LT.first * 4;
    584   }
    585   return LT.first;
    586 }
    587