Home | History | Annotate | Download | only in R600
      1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief This is the parent TargetLowering class for hardware code gen
     12 /// targets.
     13 //
     14 //===----------------------------------------------------------------------===//
     15 
     16 #include "AMDGPUISelLowering.h"
     17 #include "AMDGPU.h"
     18 #include "AMDGPUFrameLowering.h"
     19 #include "AMDGPUIntrinsicInfo.h"
     20 #include "AMDGPURegisterInfo.h"
     21 #include "AMDGPUSubtarget.h"
     22 #include "R600MachineFunctionInfo.h"
     23 #include "SIMachineFunctionInfo.h"
     24 #include "llvm/CodeGen/CallingConvLower.h"
     25 #include "llvm/CodeGen/MachineFunction.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
     29 #include "llvm/IR/DataLayout.h"
     30 #include "llvm/IR/DiagnosticInfo.h"
     31 #include "llvm/IR/DiagnosticPrinter.h"
     32 
     33 using namespace llvm;
     34 
namespace {

/// Diagnostic information for unimplemented or unsupported feature reporting.
///
/// Emitted through LLVMContext::diagnose() (see LowerCall below) so frontends
/// can surface "this target can't do X" errors instead of crashing.
class DiagnosticInfoUnsupported : public DiagnosticInfo {
private:
  // NOTE(review): holds a *reference* to the caller's Twine; the diagnostic
  // must be consumed before that Twine's temporaries die. Current callers
  // construct and diagnose() immediately, which is safe — keep it that way.
  const Twine &Description;
  const Function &Fn;

  // Plugin diagnostic kind shared by all instances; 0 means "not yet
  // allocated".
  static int KindID;

  // Lazily allocates the kind ID on first use. Not synchronized — assumes
  // diagnostics are created from a single thread.
  static int getKindID() {
    if (KindID == 0)
      KindID = llvm::getNextAvailablePluginDiagnosticKind();
    return KindID;
  }

public:
  /// \param Fn       Function the unsupported construct appears in.
  /// \param Desc     Human-readable description of the unsupported feature.
  /// \param Severity Defaults to a hard error.
  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
                          DiagnosticSeverity Severity = DS_Error)
    : DiagnosticInfo(getKindID(), Severity),
      Description(Desc),
      Fn(Fn) { }

  const Function &getFunction() const { return Fn; }
  const Twine &getDescription() const { return Description; }

  // Renders as: "unsupported <description> in <function-name>".
  void print(DiagnosticPrinter &DP) const override {
    DP << "unsupported " << getDescription() << " in " << Fn.getName();
  }

  // RTTI hook for LLVM's isa<>/dyn_cast<> on DiagnosticInfo.
  static bool classof(const DiagnosticInfo *DI) {
    return DI->getKind() == getKindID();
  }
};

int DiagnosticInfoUnsupported::KindID = 0;
}
     72 
     73 
     74 static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
     75                       CCValAssign::LocInfo LocInfo,
     76                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
     77   unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
     78                                         ArgFlags.getOrigAlign());
     79   State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
     80 
     81   return true;
     82 }
     83 
     84 #include "AMDGPUGenCallingConv.inc"
     85 
     86 // Find a larger type to do a load / store of a vector with.
     87 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
     88   unsigned StoreSize = VT.getStoreSizeInBits();
     89   if (StoreSize <= 32)
     90     return EVT::getIntegerVT(Ctx, StoreSize);
     91 
     92   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
     93   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
     94 }
     95 
     96 // Type for a vector that will be loaded to.
     97 EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
     98   unsigned StoreSize = VT.getStoreSizeInBits();
     99   if (StoreSize <= 32)
    100     return EVT::getIntegerVT(Ctx, 32);
    101 
    102   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
    103 }
    104 
/// Construct the common AMDGPU lowering: declare which generic DAG operations
/// are Legal / Custom / Expand / Promote for this family of targets, register
/// the DAG combines this file implements, and set global codegen policy
/// (booleans, scheduling, memcpy expansion limits).
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Scalar constants of every width we have registers for are materializable.
  setOperationAction(ISD::Constant, MVT::i32, Legal);
  setOperationAction(ISD::Constant, MVT::i64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

  // No indirect branches: jump tables and computed gotos must be expanded.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Library functions.  These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
  setOperationAction(ISD::FABS,   MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);

  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Custom);
  setOperationAction(ISD::FREM, MVT::f64, Custom);

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);

  // Custom lowering of vector stores is required for local address space
  // stores.
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);

  // XXX: This can be change to Custom, once ExpandVectorStores can
  // handle 64-bit stores.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);

  // Truncating stores from i64 and 64-bit vectors of i1 must be expanded.
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);


  // Mirror the store promotions above: FP loads become integer loads of the
  // same width.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
  }

  // No extending loads from sub-dword vector element types either.
  for (MVT VT : MVT::integer_vector_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
  }

  setOperationAction(ISD::BR_CC, MVT::i1, Expand);

  // Pre-CI hardware lacks the f64 rounding instructions; lower them manually.
  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);

  // No native f16 memory operations: extend on load, truncate on store.
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  // 64-bit multiplies and 32-bit unsigned division are expanded; 64-bit
  // int<->fp conversions and divrem get custom sequences.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  if (!Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);

  if (!Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction(ISD::ADD,  VT, Expand);
    setOperationAction(ISD::AND,  VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL,  VT, Expand);
    setOperationAction(ISD::OR,   VT, Expand);
    setOperationAction(ISD::SHL,  VT, Expand);
    setOperationAction(ISD::SRA,  VT, Expand);
    setOperationAction(ISD::SRL,  VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::SUB,  VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::XOR,  VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v4f32
  };

  // Vector FP operations are scalarized/expanded by default as well.
  for (MVT VT : FloatVectorTypes) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FMINNUM, VT, Expand);
    setOperationAction(ISD::FMAXNUM, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

  // DAG combines implemented by this target (see PerformDAGCombine).
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::STORE);

  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);

  // Booleans are represented as all-ones (-1) / all-zeros, scalar and vector.
  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // SI at least has hardware support for floating point exceptions, but no way
  // of using or handling them is implemented. They are also optional in OpenCL
  // (Section 7.3)
  setHasFloatingPointExceptions(false);

  setSelectIsExpensive(false);
  PredictableSelectIsExpensive = false;

  // There are no integer divide instructions, and these expand to a pretty
  // large sequence of instructions.
  setIntDivIsCheap(false);
  setPow2SDivIsCheap(false);
  setFsqrtIsCheap(true);

  // FIXME: Need to really handle these.
  MaxStoresPerMemcpy  = 4096;
  MaxStoresPerMemmove = 4096;
  MaxStoresPerMemset  = 4096;
}
    424 
    425 //===----------------------------------------------------------------------===//
    426 // Target Information
    427 //===----------------------------------------------------------------------===//
    428 
    429 MVT AMDGPUTargetLowering::getVectorIdxTy() const {
    430   return MVT::i32;
    431 }
    432 
// Every select flavor (scalar, vector-of-scalar-condition, full vector) is
// supported, so never request a branch-based expansion.
bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}
    436 
    437 // The backend supports 32 and 64 bit floating point immediates.
    438 // FIXME: Why are we reporting vectors of FP immediates as legal?
    439 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
    440   EVT ScalarVT = VT.getScalarType();
    441   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
    442 }
    443 
    444 // We don't want to shrink f64 / f32 constants.
    445 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
    446   EVT ScalarVT = VT.getScalarType();
    447   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
    448 }
    449 
    450 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
    451                                                  ISD::LoadExtType,
    452                                                  EVT NewVT) const {
    453 
    454   unsigned NewSize = NewVT.getStoreSizeInBits();
    455 
    456   // If we are reducing to a 32-bit load, this is always better.
    457   if (NewSize == 32)
    458     return true;
    459 
    460   EVT OldVT = N->getValueType(0);
    461   unsigned OldSize = OldVT.getStoreSizeInBits();
    462 
    463   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
    464   // extloads, so doing one requires using a buffer_load. In cases where we
    465   // still couldn't use a scalar load, using the wider load shouldn't really
    466   // hurt anything.
    467 
    468   // If the old size already had to be an extload, there's no harm in continuing
    469   // to reduce the width.
    470   return (OldSize < 32);
    471 }
    472 
    473 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
    474                                                    EVT CastTy) const {
    475   if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
    476     return true;
    477 
    478   unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
    479   unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
    480 
    481   return ((LScalarSize <= CastScalarSize) ||
    482           (CastScalarSize >= 32) ||
    483           (LScalarSize < 32));
    484 }
    485 
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
// FIXME: These should really have the size as a parameter.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}
    493 
// Same reasoning as isCheapToSpeculateCttz above.
bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}
    497 
    498 //===---------------------------------------------------------------------===//
    499 // Target Properties
    500 //===---------------------------------------------------------------------===//
    501 
    502 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
    503   assert(VT.isFloatingPoint());
    504   return VT == MVT::f32 || VT == MVT::f64;
    505 }
    506 
    507 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
    508   assert(VT.isFloatingPoint());
    509   return VT == MVT::f32 || VT == MVT::f64;
    510 }
    511 
    512 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
    513   // Truncate is just accessing a subregister.
    514   return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
    515 }
    516 
    517 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
    518   // Truncate is just accessing a subregister.
    519   return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
    520          (Dest->getPrimitiveSizeInBits() % 32 == 0);
    521 }
    522 
    523 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
    524   const DataLayout *DL = getDataLayout();
    525   unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
    526   unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
    527 
    528   return SrcSize == 32 && DestSize == 64;
    529 }
    530 
    531 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
    532   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
    533   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
    534   // this will enable reducing 64-bit operations the 32-bit, which is always
    535   // good.
    536   return Src == MVT::i32 && Dest == MVT::i64;
    537 }
    538 
// SDValue variant: forward to the EVT overload using the value's own type.
bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}
    542 
    543 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
    544   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
    545   // limited number of native 64-bit operations. Shrinking an operation to fit
    546   // in a single 32-bit register should always be helpful. As currently used,
    547   // this is much less general than the name suggests, and is only used in
    548   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
    549   // not profitable, and may actually be harmful.
    550   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
    551 }
    552 
    553 //===---------------------------------------------------------------------===//
    554 // TargetLowering Callbacks
    555 //===---------------------------------------------------------------------===//
    556 
    557 void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
    558                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
    559 
    560   State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
    561 }
    562 
// Lower a function return. Outgoing values are ignored here; only the
// RET_FLAG terminator node is emitted on the chain.
SDValue AMDGPUTargetLowering::LowerReturn(
                                     SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     SDLoc DL, SelectionDAG &DAG) const {
  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}
    572 
    573 //===---------------------------------------------------------------------===//
    574 // Target specific lowering
    575 //===---------------------------------------------------------------------===//
    576 
    577 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
    578                                         SmallVectorImpl<SDValue> &InVals) const {
    579   SDValue Callee = CLI.Callee;
    580   SelectionDAG &DAG = CLI.DAG;
    581 
    582   const Function &Fn = *DAG.getMachineFunction().getFunction();
    583 
    584   StringRef FuncName("<unknown>");
    585 
    586   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    587     FuncName = G->getSymbol();
    588   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    589     FuncName = G->getGlobal()->getName();
    590 
    591   DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
    592   DAG.getContext()->diagnose(NoCalls);
    593   return SDValue();
    594 }
    595 
    596 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
    597                                              SelectionDAG &DAG) const {
    598   switch (Op.getOpcode()) {
    599   default:
    600     Op.getNode()->dump();
    601     llvm_unreachable("Custom lowering code for this"
    602                      "instruction is not implemented yet!");
    603     break;
    604   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
    605   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
    606   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
    607   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
    608   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    609   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
    610   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
    611   case ISD::FREM: return LowerFREM(Op, DAG);
    612   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
    613   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
    614   case ISD::FRINT: return LowerFRINT(Op, DAG);
    615   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
    616   case ISD::FROUND: return LowerFROUND(Op, DAG);
    617   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
    618   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
    619   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
    620   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
    621   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
    622   }
    623   return Op;
    624 }
    625 
// Replace the results of a node whose result type is illegal, producing
// legal-typed replacement values in \p Results. Unhandled opcodes are left
// for generic legalization (empty Results).
void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::LOAD: {
    // LowerLOAD may decline (null node); in that case leave the node alone.
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    if (!Node)
      return;

    // Result 0 is the loaded value, result 1 the output chain.
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
    if (Lowered.getNode())
      Results.push_back(Lowered);
    return;
  }
  default:
    return;
  }
}
    660 
    661 // FIXME: This implements accesses to initialized globals in the constant
    662 // address space by copying them to private and accessing that. It does not
    663 // properly handle illegal types or vectors. The private vector loads are not
    664 // scalarized, and the illegal scalars hit an assertion. This technique will not
    665 // work well with large initializers, and this should eventually be
    666 // removed. Initialized globals should be placed into a data section that the
    667 // runtime will load into a buffer before the kernel is executed. Uses of the
    668 // global need to be replaced with a pointer loaded from an implicit kernel
    669 // argument into this buffer holding the copy of the data, which will remove the
    670 // need for any of this.
    671 SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
    672                                                        const GlobalValue *GV,
    673                                                        const SDValue &InitPtr,
    674                                                        SDValue Chain,
    675                                                        SelectionDAG &DAG) const {
    676   const DataLayout *TD = getDataLayout();
    677   SDLoc DL(InitPtr);
    678   Type *InitTy = Init->getType();
    679 
    680   if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
    681     EVT VT = EVT::getEVT(InitTy);
    682     PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    683     return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
    684                         MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
    685                         TD->getPrefTypeAlignment(InitTy));
    686   }
    687 
    688   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
    689     EVT VT = EVT::getEVT(CFP->getType());
    690     PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
    691     return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
    692                  MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
    693                  TD->getPrefTypeAlignment(CFP->getType()));
    694   }
    695 
    696   if (StructType *ST = dyn_cast<StructType>(InitTy)) {
    697     const StructLayout *SL = TD->getStructLayout(ST);
    698 
    699     EVT PtrVT = InitPtr.getValueType();
    700     SmallVector<SDValue, 8> Chains;
    701 
    702     for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
    703       SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
    704       SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
    705 
    706       Constant *Elt = Init->getAggregateElement(I);
    707       Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    708     }
    709 
    710     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    711   }
    712 
    713   if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
    714     EVT PtrVT = InitPtr.getValueType();
    715 
    716     unsigned NumElements;
    717     if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
    718       NumElements = AT->getNumElements();
    719     else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
    720       NumElements = VT->getNumElements();
    721     else
    722       llvm_unreachable("Unexpected type");
    723 
    724     unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
    725     SmallVector<SDValue, 8> Chains;
    726     for (unsigned i = 0; i < NumElements; ++i) {
    727       SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
    728       SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
    729 
    730       Constant *Elt = Init->getAggregateElement(i);
    731       Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    732     }
    733 
    734     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    735   }
    736 
    737   if (isa<UndefValue>(Init)) {
    738     EVT VT = EVT::getEVT(InitTy);
    739     PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    740     return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
    741                         MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
    742                         TD->getPrefTypeAlignment(InitTy));
    743   }
    744 
    745   Init->dump();
    746   llvm_unreachable("Unhandled constant initializer");
    747 }
    748 
    749 static bool hasDefinedInitializer(const GlobalValue *GV) {
    750   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
    751   if (!GVar || !GVar->hasInitializer())
    752     return false;
    753 
    754   if (isa<UndefValue>(GVar->getInitializer()))
    755     return false;
    756 
    757   return true;
    758 }
    759 
/// Lower a GlobalAddress node. Local (LDS) globals become a constant byte
/// offset into the kernel's LDS allocation; constant-address globals with an
/// initializer are copied to a private stack object (see
/// LowerConstantInitializer) and the frame address is returned. Any other
/// address space produces an "unsupported" diagnostic and a null SDValue.
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout *TD = getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  switch (G->getAddressSpace()) {
  case AMDGPUAS::LOCAL_ADDRESS: {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
         "Do not know what to do with an non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // An LDS global with a defined initializer is not supported here; fall
    // through to the diagnostic below.
    if (hasDefinedInitializer(GV))
      break;

    // Assign (or look up) this global's offset within the LDS block. New
    // globals are appended at the current LDSSize high-water mark.
    unsigned Offset;
    if (MFI->LocalMemoryObjects.count(GV) == 0) {
      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
      Offset = MFI->LDSSize;
      MFI->LocalMemoryObjects[GV] = Offset;
      // XXX: Account for alignment?
      MFI->LDSSize += Size;
    } else {
      Offset = MFI->LocalMemoryObjects[GV];
    }

    // The address of an LDS object is just its byte offset.
    return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
    Type *EltType = GV->getType()->getElementType();
    unsigned Size = TD->getTypeAllocSize(EltType);
    unsigned Alignment = TD->getPrefTypeAlignment(EltType);

    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);

    // Reserve a private stack slot big enough for the whole global.
    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
    SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);

    const GlobalVariable *Var = cast<GlobalVariable>(GV);
    if (!Var->hasInitializer()) {
      // This has no use, but bugpoint will hit it.
      return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
    }

    const Constant *Init = Var->getInitializer();
    SmallVector<SDNode*, 8> WorkList;

    // Collect every load hanging off the entry token BEFORE emitting the
    // initializer stores, so we can re-chain those loads after the stores.
    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
        continue;
      WorkList.push_back(*I);
    }
    SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
    // Rewrite each collected load's chain operand (operand 0) to the new
    // store chain so the loads are ordered after the initialization.
    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
                                           E = WorkList.end(); I != E; ++I) {
      SmallVector<SDValue, 8> Ops;
      Ops.push_back(Chain);
      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
        Ops.push_back((*I)->getOperand(i));
      }
      DAG.UpdateNodeOperands(*I, Ops);
    }
    // Return the private-memory copy's address, cast to a constant pointer.
    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
  }
  }

  const Function &Fn = *DAG.getMachineFunction().getFunction();
  DiagnosticInfoUnsupported BadInit(Fn,
                                    "initializer for address space");
  DAG.getContext()->diagnose(BadInit);
  return SDValue();
}
    838 
    839 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
    840                                                   SelectionDAG &DAG) const {
    841   SmallVector<SDValue, 8> Args;
    842   SDValue A = Op.getOperand(0);
    843   SDValue B = Op.getOperand(1);
    844 
    845   DAG.ExtractVectorElements(A, Args);
    846   DAG.ExtractVectorElements(B, Args);
    847 
    848   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
    849 }
    850 
    851 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
    852                                                      SelectionDAG &DAG) const {
    853 
    854   SmallVector<SDValue, 8> Args;
    855   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    856   EVT VT = Op.getValueType();
    857   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
    858                             VT.getVectorNumElements());
    859 
    860   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
    861 }
    862 
    863 SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
    864                                               SelectionDAG &DAG) const {
    865 
    866   MachineFunction &MF = DAG.getMachineFunction();
    867   const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
    868 
    869   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
    870 
    871   unsigned FrameIndex = FIN->getIndex();
    872   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
    873   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
    874                          Op.getValueType());
    875 }
    876 
/// Lower chain-less target intrinsics. Each known intrinsic ID is mapped to
/// the corresponding AMDGPUISD (or generic ISD) node; unknown intrinsics are
/// returned unchanged so later lowering can handle them.
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    SelectionDAG &DAG) const {
  // Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID constant.
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  switch (IntrinsicID) {
    default: return Op;
    case AMDGPUIntrinsic::AMDGPU_abs:
    case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
      return LowerIntrinsicIABS(Op, DAG);
    case AMDGPUIntrinsic::AMDGPU_lrp:
      return LowerIntrinsicLRP(Op, DAG);

    case AMDGPUIntrinsic::AMDGPU_clamp:
    case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_div_scale: {
      // 3rd parameter required to be a constant.
      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
      if (!Param)
        return DAG.getUNDEF(VT);

      // Translate to the operands expected by the machine instruction. The
      // first parameter must be the same as the first instruction.
      SDValue Numerator = Op.getOperand(1);
      SDValue Denominator = Op.getOperand(2);

      // Note this order is opposite of the machine instruction's operations,
      // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
      // intrinsic has the numerator as the first operand to match a normal
      // division operation.

      // A true (all-ones) selector picks the numerator as src0.
      SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                         Denominator, Numerator);
    }

    case Intrinsic::AMDGPU_div_fmas:
      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                         Op.getOperand(4));

    case Intrinsic::AMDGPU_div_fixup:
      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_trig_preop:
      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case Intrinsic::AMDGPU_rcp:
      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq:
      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq_clamped:
      // On VI and later there is no clamped-rsq node; emulate it by clamping
      // a plain RSQ to [-largest-finite, +largest-finite].
      if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
        Type *Type = VT.getTypeForEVT(*DAG.getContext());
        APFloat Max = APFloat::getLargest(Type->getFltSemantics());
        APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

        SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
        SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
                                  DAG.getConstantFP(Max, VT));
        return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                           DAG.getConstantFP(Min, VT));
      } else {
        return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
      }

    case Intrinsic::AMDGPU_ldexp:
      return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
                                                   Op.getOperand(2));

    // Signed/unsigned integer min/max.
    case AMDGPUIntrinsic::AMDGPU_imax:
      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umax:
      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_imin:
      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umin:
      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));

    // 24-bit multiply / multiply-add.
    case AMDGPUIntrinsic::AMDGPU_umul24:
      return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_imul24:
      return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_umad24:
      return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_imad24:
      return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    // Byte-lane to float conversions.
    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));

    // Bitfield extract / insert / mask.
    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfi:
      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfm:
      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_brev:
      return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));

  case Intrinsic::AMDGPU_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
    case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
      return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
  }
}
   1039 
   1040 ///IABS(a) = SMAX(sub(0, a), a)
   1041 SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
   1042                                                  SelectionDAG &DAG) const {
   1043   SDLoc DL(Op);
   1044   EVT VT = Op.getValueType();
   1045   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
   1046                                               Op.getOperand(1));
   1047 
   1048   return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
   1049 }
   1050 
   1051 /// Linear Interpolation
   1052 /// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
   1053 SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
   1054                                                 SelectionDAG &DAG) const {
   1055   SDLoc DL(Op);
   1056   EVT VT = Op.getValueType();
   1057   SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
   1058                                 DAG.getConstantFP(1.0f, MVT::f32),
   1059                                 Op.getOperand(1));
   1060   SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
   1061                                                     Op.getOperand(3));
   1062   return DAG.getNode(ISD::FADD, DL, VT,
   1063       DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
   1064       OneSubAC);
   1065 }
   1066 
/// \brief Generate Min/Max node
///
/// Fold select(setcc(LHS, RHS, CC), True, False) into FMIN_LEGACY /
/// FMAX_LEGACY when the select operands are exactly the compared values.
/// The operand order is chosen to reproduce the legacy instructions' NaN
/// behavior. Returns a null SDValue when no fold applies.
SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
                                                   EVT VT,
                                                   SDValue LHS,
                                                   SDValue RHS,
                                                   SDValue True,
                                                   SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  // The legacy min/max nodes are only produced for pre-VI generations.
  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return SDValue();

  // Only handle select(setcc(a, b), a, b) or select(setcc(a, b), b, a).
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  // Equality-style and ordered/unordered-flag compares do not correspond to
  // a min/max; fall through to the "no fold" return below.
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    // Unordered less-than: operand order reversed relative to the ordered
    // case to get the legacy NaN semantics.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    // Unordered greater-than: mirror of the unordered less-than case.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    // Ordered greater-than: same post-legalization restriction as the
    // ordered less-than case above.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
   1146 
   1147 /// \brief Generate Min/Max node
   1148 SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
   1149                                              EVT VT,
   1150                                              SDValue LHS,
   1151                                              SDValue RHS,
   1152                                              SDValue True,
   1153                                              SDValue False,
   1154                                              SDValue CC,
   1155                                              SelectionDAG &DAG) const {
   1156   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
   1157     return SDValue();
   1158 
   1159   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1160   switch (CCOpcode) {
   1161   case ISD::SETULE:
   1162   case ISD::SETULT: {
   1163     unsigned Opc = (LHS == True) ? AMDGPUISD::UMIN : AMDGPUISD::UMAX;
   1164     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   1165   }
   1166   case ISD::SETLE:
   1167   case ISD::SETLT: {
   1168     unsigned Opc = (LHS == True) ? AMDGPUISD::SMIN : AMDGPUISD::SMAX;
   1169     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   1170   }
   1171   case ISD::SETGT:
   1172   case ISD::SETGE: {
   1173     unsigned Opc = (LHS == True) ? AMDGPUISD::SMAX : AMDGPUISD::SMIN;
   1174     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   1175   }
   1176   case ISD::SETUGE:
   1177   case ISD::SETUGT: {
   1178     unsigned Opc = (LHS == True) ? AMDGPUISD::UMAX : AMDGPUISD::UMIN;
   1179     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   1180   }
   1181   default:
   1182     return SDValue();
   1183   }
   1184 }
   1185 
   1186 SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
   1187                                                   SelectionDAG &DAG) const {
   1188   LoadSDNode *Load = cast<LoadSDNode>(Op);
   1189   EVT MemVT = Load->getMemoryVT();
   1190   EVT MemEltVT = MemVT.getVectorElementType();
   1191 
   1192   EVT LoadVT = Op.getValueType();
   1193   EVT EltVT = LoadVT.getVectorElementType();
   1194   EVT PtrVT = Load->getBasePtr().getValueType();
   1195 
   1196   unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
   1197   SmallVector<SDValue, 8> Loads;
   1198   SmallVector<SDValue, 8> Chains;
   1199 
   1200   SDLoc SL(Op);
   1201   unsigned MemEltSize = MemEltVT.getStoreSize();
   1202   MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
   1203 
   1204   for (unsigned i = 0; i < NumElts; ++i) {
   1205     SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
   1206                               DAG.getConstant(i * MemEltSize, PtrVT));
   1207 
   1208     SDValue NewLoad
   1209       = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
   1210                        Load->getChain(), Ptr,
   1211                        SrcValue.getWithOffset(i * MemEltSize),
   1212                        MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
   1213                        Load->isInvariant(), Load->getAlignment());
   1214     Loads.push_back(NewLoad.getValue(0));
   1215     Chains.push_back(NewLoad.getValue(1));
   1216   }
   1217 
   1218   SDValue Ops[] = {
   1219     DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
   1220     DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
   1221   };
   1222 
   1223   return DAG.getMergeValues(Ops, SL);
   1224 }
   1225 
   1226 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
   1227                                               SelectionDAG &DAG) const {
   1228   EVT VT = Op.getValueType();
   1229 
   1230   // If this is a 2 element vector, we really want to scalarize and not create
   1231   // weird 1 element vectors.
   1232   if (VT.getVectorNumElements() == 2)
   1233     return ScalarizeVectorLoad(Op, DAG);
   1234 
   1235   LoadSDNode *Load = cast<LoadSDNode>(Op);
   1236   SDValue BasePtr = Load->getBasePtr();
   1237   EVT PtrVT = BasePtr.getValueType();
   1238   EVT MemVT = Load->getMemoryVT();
   1239   SDLoc SL(Op);
   1240   MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
   1241 
   1242   EVT LoVT, HiVT;
   1243   EVT LoMemVT, HiMemVT;
   1244   SDValue Lo, Hi;
   1245 
   1246   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
   1247   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
   1248   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
   1249   SDValue LoLoad
   1250     = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
   1251                      Load->getChain(), BasePtr,
   1252                      SrcValue,
   1253                      LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
   1254                      Load->isInvariant(), Load->getAlignment());
   1255 
   1256   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
   1257                               DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));
   1258 
   1259   SDValue HiLoad
   1260     = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
   1261                      Load->getChain(), HiPtr,
   1262                      SrcValue.getWithOffset(LoMemVT.getStoreSize()),
   1263                      HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
   1264                      Load->isInvariant(), Load->getAlignment());
   1265 
   1266   SDValue Ops[] = {
   1267     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
   1268     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
   1269                 LoLoad.getValue(1), HiLoad.getValue(1))
   1270   };
   1271 
   1272   return DAG.getMergeValues(Ops, SL);
   1273 }
   1274 
   1275 SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
   1276                                                SelectionDAG &DAG) const {
   1277   StoreSDNode *Store = cast<StoreSDNode>(Op);
   1278   EVT MemVT = Store->getMemoryVT();
   1279   unsigned MemBits = MemVT.getSizeInBits();
   1280 
   1281   // Byte stores are really expensive, so if possible, try to pack 32-bit vector
   1282   // truncating store into an i32 store.
   1283   // XXX: We could also handle optimize other vector bitwidths.
   1284   if (!MemVT.isVector() || MemBits > 32) {
   1285     return SDValue();
   1286   }
   1287 
   1288   SDLoc DL(Op);
   1289   SDValue Value = Store->getValue();
   1290   EVT VT = Value.getValueType();
   1291   EVT ElemVT = VT.getVectorElementType();
   1292   SDValue Ptr = Store->getBasePtr();
   1293   EVT MemEltVT = MemVT.getVectorElementType();
   1294   unsigned MemEltBits = MemEltVT.getSizeInBits();
   1295   unsigned MemNumElements = MemVT.getVectorNumElements();
   1296   unsigned PackedSize = MemVT.getStoreSizeInBits();
   1297   SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);
   1298 
   1299   assert(Value.getValueType().getScalarSizeInBits() >= 32);
   1300 
   1301   SDValue PackedValue;
   1302   for (unsigned i = 0; i < MemNumElements; ++i) {
   1303     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
   1304                               DAG.getConstant(i, MVT::i32));
   1305     Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
   1306     Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
   1307 
   1308     SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
   1309     Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
   1310 
   1311     if (i == 0) {
   1312       PackedValue = Elt;
   1313     } else {
   1314       PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
   1315     }
   1316   }
   1317 
   1318   if (PackedSize < 32) {
   1319     EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
   1320     return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
   1321                              Store->getMemOperand()->getPointerInfo(),
   1322                              PackedVT,
   1323                              Store->isNonTemporal(), Store->isVolatile(),
   1324                              Store->getAlignment());
   1325   }
   1326 
   1327   return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
   1328                       Store->getMemOperand()->getPointerInfo(),
   1329                       Store->isVolatile(),  Store->isNonTemporal(),
   1330                       Store->getAlignment());
   1331 }
   1332 
   1333 SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
   1334                                                    SelectionDAG &DAG) const {
   1335   StoreSDNode *Store = cast<StoreSDNode>(Op);
   1336   EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
   1337   EVT EltVT = Store->getValue().getValueType().getVectorElementType();
   1338   EVT PtrVT = Store->getBasePtr().getValueType();
   1339   unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
   1340   SDLoc SL(Op);
   1341 
   1342   SmallVector<SDValue, 8> Chains;
   1343 
   1344   unsigned EltSize = MemEltVT.getStoreSize();
   1345   MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
   1346 
   1347   for (unsigned i = 0, e = NumElts; i != e; ++i) {
   1348     SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
   1349                               Store->getValue(),
   1350                               DAG.getConstant(i, MVT::i32));
   1351 
   1352     SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT);
   1353     SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
   1354     SDValue NewStore =
   1355       DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
   1356                         SrcValue.getWithOffset(i * EltSize),
   1357                         MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
   1358                         Store->getAlignment());
   1359     Chains.push_back(NewStore);
   1360   }
   1361 
   1362   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
   1363 }
   1364 
/// \brief Split a vector store into two half-width vector stores (low and
/// high halves), except 2-element vectors which are scalarized instead.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return ScalarizeVectorStore(Op, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split the in-register type, the memory type, and then the value itself.
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);

  // The high half is stored immediately after the low half in memory.
  EVT PtrVT = BasePtr.getValueType();
  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                              DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));

  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
  SDValue LoStore
    = DAG.getTruncStore(Chain, SL, Lo,
                        BasePtr,
                        SrcValue,
                        LoMemVT,
                        Store->isNonTemporal(),
                        Store->isVolatile(),
                        Store->getAlignment());
  SDValue HiStore
    = DAG.getTruncStore(Chain, SL, Hi,
                        HiPtr,
                        SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                        HiMemVT,
                        Store->isNonTemporal(),
                        Store->isVolatile(),
                        Store->getAlignment());

  // Merge the two store chains.
  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
   1413 
   1414 
/// \brief Custom lowering for loads.
///
/// i1 non-extending loads become an i8 extload plus truncate. On pre-SI
/// targets, sub-dword extending loads from private memory are emulated with
/// a REGISTER_LOAD of the containing dword followed by a shift and extend.
SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT VT = Op.getValueType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, MVT::i8, MMO);

    // Return the truncated value together with the new load's chain.
    SDValue Ops[] = {
      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  // Everything below handles only pre-SI sub-dword extloads from private
  // memory; all other loads are left to the generic legalizer.
  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
      Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
    return SDValue();


  // Dword (register) index = byte address / 4.
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                            Load->getChain(), Ptr,
                            DAG.getTargetConstant(0, MVT::i32),
                            Op.getOperand(2));
  // Byte offset within the dword, converted to a bit shift (x8).
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                Load->getBasePtr(),
                                DAG.getConstant(0x3, MVT::i32));
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, MVT::i32));

  // Move the addressed byte/short down to bit 0.
  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);

  // Extend in-register according to the requested extension type.
  EVT MemEltVT = MemVT.getScalarType();
  if (ExtType == ISD::SEXTLOAD) {
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);

    SDValue Ops[] = {
      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
      Load->getChain()
    };

    return DAG.getMergeValues(Ops, DL);
  }

  SDValue Ops[] = {
    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
    Load->getChain()
  };

  return DAG.getMergeValues(Ops, DL);
}
   1481 
/// \brief Custom lowering for stores.
///
/// Vector stores to local/private memory are scalarized, and sub-dword
/// private stores are expanded to a read-modify-write of the 32-bit
/// register (dword) containing the addressed byte/short.
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // First give MergeVectorStore a chance to handle this store.
  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Chain = Store->getChain();
  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
      Store->getValue().getValueType().isVector()) {
    return ScalarizeVectorStore(Op, DAG);
  }

  EVT MemVT = Store->getMemoryVT();
  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
      MemVT.bitsLT(MVT::i32)) {
    // Mask covering the bits of the dword that this store replaces.
    unsigned Mask = 0;
    if (Store->getMemoryVT() == MVT::i8) {
      Mask = 0xff;
    } else if (Store->getMemoryVT() == MVT::i16) {
      Mask = 0xffff;
    }
    SDValue BasePtr = Store->getBasePtr();
    // Dword (register) index = byte address / 4.
    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
                              DAG.getConstant(2, MVT::i32));
    // Read the current contents of the containing dword.
    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                              Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));

    // Byte offset within the dword, converted to a bit shift (x8).
    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
                                  DAG.getConstant(0x3, MVT::i32));

    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                   DAG.getConstant(3, MVT::i32));

    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                    Store->getValue());

    // Keep only the low MemVT bits of the value being stored.
    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

    // Move the value into position within the dword.
    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                       MaskedValue, ShiftAmt);

    // Clear the destination bits (~(Mask << ShiftAmt)), then OR in the value.
    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
                                  ShiftAmt);
    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
                          DAG.getConstant(0xffffffff, MVT::i32));
    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                       Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
  }
  return SDValue();
}
   1538 
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit integer.
//
// \p sign selects signed vs. unsigned division. Returns the quotient and
// remainder merged into a single node.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  ISD::NodeType ToFp  = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  // Vectors are converted element-wise via the corresponding vector types.
  if (VT.isVector()) {
    unsigned NElts = VT.getVectorNumElements();
    IntVT = MVT::getVectorVT(MVT::i32, NElts);
    FltVT = MVT::getVectorVT(MVT::f32, NElts);
  }

  unsigned BitSize = VT.getScalarType().getSizeInBits();

  // jq is the +/-1 correction applied when the float quotient estimate is
  // one off (see the cv select below).
  SDValue jq = DAG.getConstant(1, IntVT);

  if (sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));

    // jq = (int)jq
    jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
  }

  // int ia = (int)LHS;
  SDValue ia = sign ?
    DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);

  // int ib = (int)RHS;
  SDValue ib = sign ?
    DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  // float fq = native_divide(fa, fb);
  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  // float fr = mad(fqneg, fb, fa);
  SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
                           DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT));

  // dst = trunc/extend to legal type
  iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}
   1638 
   1639 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
   1640                                       SelectionDAG &DAG,
   1641                                       SmallVectorImpl<SDValue> &Results) const {
   1642   assert(Op.getValueType() == MVT::i64);
   1643 
   1644   SDLoc DL(Op);
   1645   EVT VT = Op.getValueType();
   1646   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
   1647 
   1648   SDValue one = DAG.getConstant(1, HalfVT);
   1649   SDValue zero = DAG.getConstant(0, HalfVT);
   1650 
   1651   //HiLo split
   1652   SDValue LHS = Op.getOperand(0);
   1653   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
   1654   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
   1655 
   1656   SDValue RHS = Op.getOperand(1);
   1657   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
   1658   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
   1659 
   1660   if (VT == MVT::i64 &&
   1661     DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
   1662     DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
   1663 
   1664     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
   1665                               LHS_Lo, RHS_Lo);
   1666 
   1667     SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
   1668     SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
   1669     Results.push_back(DIV);
   1670     Results.push_back(REM);
   1671     return;
   1672   }
   1673 
   1674   // Get Speculative values
   1675   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
   1676   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
   1677 
   1678   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
   1679   SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
   1680 
   1681   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
   1682   SDValue DIV_Lo = zero;
   1683 
   1684   const unsigned halfBitWidth = HalfVT.getSizeInBits();
   1685 
   1686   for (unsigned i = 0; i < halfBitWidth; ++i) {
   1687     const unsigned bitPos = halfBitWidth - i - 1;
   1688     SDValue POS = DAG.getConstant(bitPos, HalfVT);
   1689     // Get value of high bit
   1690     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
   1691     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
   1692     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
   1693 
   1694     // Shift
   1695     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, VT));
   1696     // Add LHS high bit
   1697     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
   1698 
   1699     SDValue BIT = DAG.getConstant(1 << bitPos, HalfVT);
   1700     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
   1701 
   1702     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
   1703 
   1704     // Update REM
   1705     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
   1706     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
   1707   }
   1708 
   1709   SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
   1710   Results.push_back(DIV);
   1711   Results.push_back(REM);
   1712 }
   1713 
/// \brief Lower unsigned UDIVREM.
///
/// i64 is delegated to LowerUDIVREM64. i32 values known to fit in 24 bits
/// use the float-based fast path. Otherwise the quotient is derived from a
/// hardware reciprocal estimate (URECIP) and then corrected, since the
/// estimate may be off by one in either direction.
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  if (VT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
        DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
      // TODO: We technically could do this for i64, but shouldn't that just be
      // handled by something generally reducing 64-bit division on 32-bit
      // values to 32-bit?
      return LowerDIVREM24(Op, DAG, false);
    }
  }

  // RCP =  URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);

  // RCP_LO = mul(RCP, Den) */
  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);

  // RCP_HI = mulhu (RCP, Den) */
  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                                     RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
                                           NEG_RCP_LO, RCP_LO,
                                           ISD::SETEQ);
  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);

  // RCP_S_E = RCP - E
  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
                                     RCP_A_E, RCP_S_E,
                                     ISD::SETEQ);
  // Quotient = mulhu(Tmp0, Num)
  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
                                                 DAG.getConstant(-1, VT),
                                                 DAG.getConstant(0, VT),
                                                 ISD::SETUGE);
  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
                                                  Num_S_Remainder,
                                                  DAG.getConstant(-1, VT),
                                                  DAG.getConstant(0, VT),
                                                  ISD::SETUGE);
  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
                                               Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
                                                         DAG.getConstant(1, VT));

  // Quotient_S_One = Quotient - 1
  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
                                                         DAG.getConstant(1, VT));

  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
                                     Quotient, Quotient_A_One, ISD::SETEQ);

  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
                            Quotient_S_One, Div, ISD::SETEQ);

  // Calculate Rem result:

  // Remainder_S_Den = Remainder - Den
  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);

  // Remainder_A_Den = Remainder + Den
  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);

  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
                                    Remainder, Remainder_S_Den, ISD::SETEQ);

  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
                            Remainder_A_Den, Rem, ISD::SETEQ);
  SDValue Ops[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops, DL);
}
   1833 
/// \brief Lower signed SDIVREM by dividing absolute values with UDIVREM and
/// fixing up the signs afterwards.
///
/// i32 operands known to fit in 24 signed bits use the f32-based fast path;
/// i64 operands known to fit in 32 bits use a 32-bit SDIVREM.
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue Zero = DAG.getConstant(0, VT);
  SDValue NegOne = DAG.getConstant(-1, VT);

  if (VT == MVT::i32 &&
      DAG.ComputeNumSignBits(LHS) > 8 &&
      DAG.ComputeNumSignBits(RHS) > 8) {
    return LowerDIVREM24(Op, DAG, true);
  }
  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(LHS) > 32 &&
      DAG.ComputeNumSignBits(RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    //HiLo split
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                                 LHS_Lo, RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    };
    return DAG.getMergeValues(Res, DL);
  }

  // Sign masks: all-ones if the operand is negative, otherwise zero.
  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  // |x| = (x + sign) ^ sign  (two's complement negate when sign == -1).
  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);

  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);

  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
  SDValue Rem = Div.getValue(1);

  // Re-apply the signs: (x ^ sign) - sign negates when sign == -1.
  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);

  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}
   1893 
   1894 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
   1895 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
   1896   SDLoc SL(Op);
   1897   EVT VT = Op.getValueType();
   1898   SDValue X = Op.getOperand(0);
   1899   SDValue Y = Op.getOperand(1);
   1900 
   1901   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
   1902   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
   1903   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
   1904 
   1905   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
   1906 }
   1907 
   1908 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   1909   SDLoc SL(Op);
   1910   SDValue Src = Op.getOperand(0);
   1911 
   1912   // result = trunc(src)
   1913   // if (src > 0.0 && src != result)
   1914   //   result += 1.0
   1915 
   1916   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
   1917 
   1918   const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
   1919   const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
   1920 
   1921   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
   1922 
   1923   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
   1924   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
   1925   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
   1926 
   1927   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
   1928   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
   1929 }
   1930 
   1931 static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
   1932   const unsigned FractBits = 52;
   1933   const unsigned ExpBits = 11;
   1934 
   1935   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
   1936                                 Hi,
   1937                                 DAG.getConstant(FractBits - 32, MVT::i32),
   1938                                 DAG.getConstant(ExpBits, MVT::i32));
   1939   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
   1940                             DAG.getConstant(1023, MVT::i32));
   1941 
   1942   return Exp;
   1943 }
   1944 
/// \brief Lower f64 FTRUNC (round toward zero) by clearing the fraction
/// bits of the IEEE-754 representation directly.
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, MVT::i32);
  const SDValue One = DAG.getConstant(1, MVT::i32);

  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                  Zero, SignBit);
  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);

  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
  const SDValue FractMask
    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);

  // Mask of the fraction bits that must be cleared for this exponent:
  // the 52-bit fraction mask shifted right (arithmetically) by Exp.
  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);

  // Exponent < 0: |src| < 1, so the result is +/-0 (just the sign bit).
  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
  // Exponent > 51: already integral, pass the input through unchanged.
  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);

  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}
   1993 
   1994 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
   1995   SDLoc SL(Op);
   1996   SDValue Src = Op.getOperand(0);
   1997 
   1998   assert(Op.getValueType() == MVT::f64);
   1999 
   2000   APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
   2001   SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
   2002   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
   2003 
   2004   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
   2005   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
   2006 
   2007   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
   2008 
   2009   APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
   2010   SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
   2011 
   2012   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
   2013   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
   2014 
   2015   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
   2016 }
   2017 
   2018 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
   2019   // FNEARBYINT and FRINT are the same, except in their handling of FP
   2020   // exceptions. Those aren't really meaningful for us, and OpenCL only has
   2021   // rint, so just treat them as equivalent.
   2022   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
   2023 }
   2024 
   2025 // XXX - May require not supporting f32 denormals?
   2026 SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
   2027   SDLoc SL(Op);
   2028   SDValue X = Op.getOperand(0);
   2029 
   2030   SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
   2031 
   2032   SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
   2033 
   2034   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
   2035 
   2036   const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
   2037   const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
   2038   const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
   2039 
   2040   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
   2041 
   2042   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
   2043 
   2044   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
   2045 
   2046   SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
   2047 
   2048   return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
   2049 }
   2050 
/// \brief Lower f64 FROUND by manipulating the IEEE-754 bit pattern
/// directly, guided by the unbiased exponent.
SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);

  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);

  const SDValue Zero = DAG.getConstant(0, MVT::i32);
  const SDValue One = DAG.getConstant(1, MVT::i32);
  const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
  const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);


  // The sign and exponent live in the high 32 bits.
  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);

  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  // M masks off the fraction bits of X for this exponent; D is the rounding
  // increment (the highest fraction bit, shifted to this exponent's
  // position).
  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);

  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
                          DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
                          Exp);

  // If any fraction bits are set, add the increment, then clear the
  // fraction bits.
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
                              DAG.getConstant(0, MVT::i64), Tmp0,
                              ISD::SETNE);

  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
                             D, DAG.getConstant(0, MVT::i64));
  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);

  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);

  // Exponent == -1 means 0.5 <= |x| < 1, which rounds to +/-1; smaller
  // exponents round to +/-0.
  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
                            ExpEqNegOne,
                            DAG.getConstantFP(1.0, MVT::f64),
                            DAG.getConstantFP(0.0, MVT::f64));

  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);

  // Exponent < 0: result is copysign(0 or 1, x). Exponent > 51: x is
  // already integral, so return it unchanged.
  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);

  return K;
}
   2105 
   2106 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   2107   EVT VT = Op.getValueType();
   2108 
   2109   if (VT == MVT::f32)
   2110     return LowerFROUND32(Op, DAG);
   2111 
   2112   if (VT == MVT::f64)
   2113     return LowerFROUND64(Op, DAG);
   2114 
   2115   llvm_unreachable("unhandled type");
   2116 }
   2117 
   2118 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   2119   SDLoc SL(Op);
   2120   SDValue Src = Op.getOperand(0);
   2121 
   2122   // result = trunc(src);
   2123   // if (src < 0.0 && src != result)
   2124   //   result += -1.0.
   2125 
   2126   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
   2127 
   2128   const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
   2129   const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
   2130 
   2131   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
   2132 
   2133   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
   2134   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
   2135   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
   2136 
   2137   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
   2138   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
   2139 }
   2140 
   2141 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
   2142                                                bool Signed) const {
   2143   SDLoc SL(Op);
   2144   SDValue Src = Op.getOperand(0);
   2145 
   2146   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
   2147 
   2148   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
   2149                            DAG.getConstant(0, MVT::i32));
   2150   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
   2151                            DAG.getConstant(1, MVT::i32));
   2152 
   2153   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
   2154                               SL, MVT::f64, Hi);
   2155 
   2156   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
   2157 
   2158   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
   2159                               DAG.getConstant(32, MVT::i32));
   2160 
   2161   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
   2162 }
   2163 
   2164 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
   2165                                                SelectionDAG &DAG) const {
   2166   SDValue S0 = Op.getOperand(0);
   2167   if (S0.getValueType() != MVT::i64)
   2168     return SDValue();
   2169 
   2170   EVT DestVT = Op.getValueType();
   2171   if (DestVT == MVT::f64)
   2172     return LowerINT_TO_FP64(Op, DAG, false);
   2173 
   2174   assert(DestVT == MVT::f32);
   2175 
   2176   SDLoc DL(Op);
   2177 
   2178   // f32 uint_to_fp i64
   2179   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
   2180                            DAG.getConstant(0, MVT::i32));
   2181   SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
   2182   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
   2183                            DAG.getConstant(1, MVT::i32));
   2184   SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
   2185   FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
   2186                         DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
   2187   return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
   2188 }
   2189 
   2190 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
   2191                                               SelectionDAG &DAG) const {
   2192   SDValue Src = Op.getOperand(0);
   2193   if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64)
   2194     return LowerINT_TO_FP64(Op, DAG, true);
   2195 
   2196   return SDValue();
   2197 }
   2198 
// Expand f64 -> i64 conversion: truncate, split into high and low 32-bit
// halves using exact f64 arithmetic, convert each half, and pack the two
// words back into an i64.
SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(0);

  // Drop the fractional part up front; the split below assumes an
  // integral value.
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  // K0 = 2^-32 and K1 = -(2^32), built from their exact bit patterns.
  SDValue K0
    = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64);
  SDValue K1
    = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64);

  // FloorMul = floor(Trunc * 2^-32): the high 32 bits as an f64.
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);

  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);


  // Fma = Trunc - FloorMul * 2^32: the remaining low 32 bits, computed
  // exactly with a fused multiply-add.
  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);

  // Only the high half is sign-aware; the low half is always unsigned.
  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
                           MVT::i32, FloorMul);
  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

  // Pack (Lo, Hi) back into a 64-bit result.
  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi);

  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}
   2227 
   2228 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
   2229                                               SelectionDAG &DAG) const {
   2230   SDValue Src = Op.getOperand(0);
   2231 
   2232   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
   2233     return LowerFP64_TO_INT(Op, DAG, true);
   2234 
   2235   return SDValue();
   2236 }
   2237 
   2238 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
   2239                                               SelectionDAG &DAG) const {
   2240   SDValue Src = Op.getOperand(0);
   2241 
   2242   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
   2243     return LowerFP64_TO_INT(Op, DAG, false);
   2244 
   2245   return SDValue();
   2246 }
   2247 
   2248 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   2249                                                      SelectionDAG &DAG) const {
   2250   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
   2251   MVT VT = Op.getSimpleValueType();
   2252   MVT ScalarVT = VT.getScalarType();
   2253 
   2254   if (!VT.isVector())
   2255     return SDValue();
   2256 
   2257   SDValue Src = Op.getOperand(0);
   2258   SDLoc DL(Op);
   2259 
   2260   // TODO: Don't scalarize on Evergreen?
   2261   unsigned NElts = VT.getVectorNumElements();
   2262   SmallVector<SDValue, 8> Args;
   2263   DAG.ExtractVectorElements(Src, Args, 0, NElts);
   2264 
   2265   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
   2266   for (unsigned I = 0; I < NElts; ++I)
   2267     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
   2268 
   2269   return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
   2270 }
   2271 
   2272 //===----------------------------------------------------------------------===//
   2273 // Custom DAG optimizations
   2274 //===----------------------------------------------------------------------===//
   2275 
   2276 static bool isU24(SDValue Op, SelectionDAG &DAG) {
   2277   APInt KnownZero, KnownOne;
   2278   EVT VT = Op.getValueType();
   2279   DAG.computeKnownBits(Op, KnownZero, KnownOne);
   2280 
   2281   return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
   2282 }
   2283 
   2284 static bool isI24(SDValue Op, SelectionDAG &DAG) {
   2285   EVT VT = Op.getValueType();
   2286 
   2287   // In order for this to be a signed 24-bit value, bit 23, must
   2288   // be a sign bit.
   2289   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
   2290                                      // as unsigned 24-bit values.
   2291          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
   2292 }
   2293 
   2294 static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
   2295 
   2296   SelectionDAG &DAG = DCI.DAG;
   2297   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   2298   EVT VT = Op.getValueType();
   2299 
   2300   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
   2301   APInt KnownZero, KnownOne;
   2302   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
   2303   if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
   2304     DCI.CommitTargetLoweringOpt(TLO);
   2305 }
   2306 
// Constant fold a bitfield extract (BFE): take Width bits of Src0 starting
// at bit Offset, extended to 32 bits.  IntTy selects the extension:
// int32_t sign-extends, uint32_t zero-extends.
template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
                               uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    // Shift the field up to the top of the word, then shift it back down;
    // the right shift of IntTy performs the desired sign/zero extension.
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    return DAG.getConstant(Result, MVT::i32);
  }

  // The field reaches the top bit, so a single right shift both extracts
  // it and (via IntTy's signedness) provides the extension.
  return DAG.getConstant(Src0 >> Offset, MVT::i32);
}
   2318 
   2319 static bool usesAllNormalStores(SDNode *LoadVal) {
   2320   for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
   2321     if (!ISD::isNormalStore(*I))
   2322       return false;
   2323   }
   2324 
   2325   return true;
   2326 }
   2327 
   2328 // If we have a copy of an illegal type, replace it with a load / store of an
   2329 // equivalently sized legal type. This avoids intermediate bit pack / unpack
   2330 // instructions emitted when handling extloads and truncstores. Ideally we could
   2331 // recognize the pack / unpack pattern to eliminate it.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  StoreSDNode *SN = cast<StoreSDNode>(N);
  SDValue Value = SN->getValue();
  EVT VT = Value.getValueType();

  // Only worthwhile when the stored value is an illegal-typed plain load of
  // at least a byte; legal types and volatile accesses are left alone.
  if (isTypeLegal(VT) || SN->isVolatile() ||
      !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
    return SDValue();

  // Every user of the load must be a plain store; otherwise some user
  // would still need the original typed value.
  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
    return SDValue();

  EVT MemVT = LoadVal->getMemoryVT();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  // A legal type of the same size as the loaded memory type.
  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);

  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
                                LoadVT, SL,
                                LoadVal->getChain(),
                                LoadVal->getBasePtr(),
                                LoadVal->getOffset(),
                                LoadVT,
                                LoadVal->getMemOperand());

  // Replace the old load's value uses with a bitcast of the new load, and
  // its chain uses with the new load's chain.
  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);

  // Store the new legal-typed value directly (no bitcast needed).
  return DAG.getStore(SN->getChain(), SL, NewLoad,
                      SN->getBasePtr(), SN->getMemOperand());
}
   2369 
   2370 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   2371                                                 DAGCombinerInfo &DCI) const {
   2372   EVT VT = N->getValueType(0);
   2373 
   2374   if (VT.isVector() || VT.getSizeInBits() > 32)
   2375     return SDValue();
   2376 
   2377   SelectionDAG &DAG = DCI.DAG;
   2378   SDLoc DL(N);
   2379 
   2380   SDValue N0 = N->getOperand(0);
   2381   SDValue N1 = N->getOperand(1);
   2382   SDValue Mul;
   2383 
   2384   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
   2385     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
   2386     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
   2387     Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
   2388   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
   2389     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
   2390     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
   2391     Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
   2392   } else {
   2393     return SDValue();
   2394   }
   2395 
   2396   // We need to use sext even for MUL_U24, because MUL_U24 is used
   2397   // for signed multiply of 8 and 16-bit types.
   2398   return DAG.getSExtOrTrunc(Mul, DL, VT);
   2399 }
   2400 
// Target DAG combines: 24-bit multiplies, select -> min/max patterns,
// BFE simplification/folding, and load/store type legalization.
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
    default: break;
    case ISD::MUL:
      return performMulCombine(N, DCI);
    case AMDGPUISD::MUL_I24:
    case AMDGPUISD::MUL_U24: {
      // Shrink the operands down to their demanded (low 24) bits.
      SDValue N0 = N->getOperand(0);
      SDValue N1 = N->getOperand(1);
      simplifyI24(N0, DCI);
      simplifyI24(N1, DCI);
      return SDValue();
    }
  case ISD::SELECT: {
    // select (setcc lhs, rhs, cc), true, false -> min/max where possible.
    SDValue Cond = N->getOperand(0);
    if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
      SDLoc DL(N);
      EVT VT = N->getValueType(0);
      SDValue LHS = Cond.getOperand(0);
      SDValue RHS = Cond.getOperand(1);
      SDValue CC = Cond.getOperand(2);

      SDValue True = N->getOperand(1);
      SDValue False = N->getOperand(2);

      if (VT == MVT::f32)
        return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);

      // TODO: Implement min / max Evergreen instructions.
      if (VT == MVT::i32 &&
          Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
        return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
      }
    }

    break;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    // The hardware only reads the low 5 bits of the width / offset
    // operands; a zero-width extract always produces 0.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends, although we can could
        // handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    // Constant source: fold the extract completely.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        CVal->getSExtValue(),
                                        OffsetVal,
                                        WidthVal);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       CVal->getZExtValue(),
                                       OffsetVal,
                                       WidthVal);
    }

    // The field reaches bit 31, so the BFE is just a (signed or unsigned)
    // right shift.
    if ((OffsetVal + WidthVal) >= 32) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    // Otherwise, only the extracted bit range of the source is demanded;
    // try to simplify the source based on that.
    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(32,
                                         OffsetVal,
                                         OffsetVal + WidthVal);

      APInt KnownZero, KnownOne;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded,
                                   KnownZero, KnownOne, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }

  case ISD::STORE:
    return performStoreCombine(N, DCI);
  }
  return SDValue();
}
   2530 
   2531 //===----------------------------------------------------------------------===//
   2532 // Helper functions
   2533 //===----------------------------------------------------------------------===//
   2534 
/// Reconstruct an approximation of the pre-legalization argument list.
/// Ins describes the arguments after type legalization (vectors may have
/// been split into scalars, split into smaller vectors, or had their
/// elements promoted); OrigIns receives one InputArg per entry with a type
/// chosen to undo that transformation.
void AMDGPUTargetLowering::getOriginalFunctionArgs(
                               SelectionDAG &DAG,
                               const Function *F,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    // Untouched by legalization; copy through unchanged.
    if (Ins[i].ArgVT == Ins[i].VT) {
      OrigIns.push_back(Ins[i]);
      continue;
    }

    EVT VT;
    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
      // Vector has been split into scalars.
      VT = Ins[i].ArgVT.getVectorElementType();
    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
               Ins[i].ArgVT.getVectorElementType() !=
               Ins[i].VT.getVectorElementType()) {
      // Vector elements have been promoted.
      VT = Ins[i].ArgVT;
    } else {
      // Vector has been split into smaller vectors.
      VT = Ins[i].VT;
    }

    // Rebuild the argument with the recovered type, preserving flags and
    // the original argument index / part offset.
    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
    OrigIns.push_back(Arg);
  }
}
   2566 
   2567 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
   2568   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
   2569     return CFP->isExactlyValue(1.0);
   2570   }
   2571   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   2572     return C->isAllOnesValue();
   2573   }
   2574   return false;
   2575 }
   2576 
   2577 bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
   2578   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
   2579     return CFP->getValueAPF().isZero();
   2580   }
   2581   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   2582     return C->isNullValue();
   2583   }
   2584   return false;
   2585 }
   2586 
   2587 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
   2588                                                   const TargetRegisterClass *RC,
   2589                                                    unsigned Reg, EVT VT) const {
   2590   MachineFunction &MF = DAG.getMachineFunction();
   2591   MachineRegisterInfo &MRI = MF.getRegInfo();
   2592   unsigned VirtualRegister;
   2593   if (!MRI.isLiveIn(Reg)) {
   2594     VirtualRegister = MRI.createVirtualRegister(RC);
   2595     MRI.addLiveIn(Reg, VirtualRegister);
   2596   } else {
   2597     VirtualRegister = MRI.getLiveInVirtReg(Reg);
   2598   }
   2599   return DAG.getRegister(VirtualRegister, VT);
   2600 }
   2601 
// Expands to a case returning the node's name as a string literal.
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

// Returns the textual name of a target-specific DAG node opcode, or null
// for opcodes this target does not define.
const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return nullptr;
  // AMDIL DAG nodes
  NODE_NAME_CASE(CALL);
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(RET_FLAG);
  NODE_NAME_CASE(BRANCH_COND);

  // AMDGPU DAG nodes
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(FMAX_LEGACY)
  NODE_NAME_CASE(SMAX)
  NODE_NAME_CASE(UMAX)
  NODE_NAME_CASE(FMIN_LEGACY)
  NODE_NAME_CASE(SMIN)
  NODE_NAME_CASE(UMIN)
  NODE_NAME_CASE(FMAX3)
  NODE_NAME_CASE(SMAX3)
  NODE_NAME_CASE(UMAX3)
  NODE_NAME_CASE(FMIN3)
  NODE_NAME_CASE(SMIN3)
  NODE_NAME_CASE(UMIN3)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
  NODE_NAME_CASE(DIV_FIXUP)
  NODE_NAME_CASE(TRIG_PREOP)
  NODE_NAME_CASE(RCP)
  NODE_NAME_CASE(RSQ)
  NODE_NAME_CASE(RSQ_LEGACY)
  NODE_NAME_CASE(RSQ_CLAMPED)
  NODE_NAME_CASE(LDEXP)
  NODE_NAME_CASE(FP_CLASS)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(BFI)
  NODE_NAME_CASE(BFM)
  NODE_NAME_CASE(BREV)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(LOAD_INPUT)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(CVT_F32_UBYTE0)
  NODE_NAME_CASE(CVT_F32_UBYTE1)
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(CONST_DATA_PTR)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  }
}
   2670 
   2671 SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
   2672                                                DAGCombinerInfo &DCI,
   2673                                                unsigned &RefinementSteps,
   2674                                                bool &UseOneConstNR) const {
   2675   SelectionDAG &DAG = DCI.DAG;
   2676   EVT VT = Operand.getValueType();
   2677 
   2678   if (VT == MVT::f32) {
   2679     RefinementSteps = 0;
   2680     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
   2681   }
   2682 
   2683   // TODO: There is also f64 rsq instruction, but the documentation is less
   2684   // clear on its precision.
   2685 
   2686   return SDValue();
   2687 }
   2688 
   2689 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
   2690                                                DAGCombinerInfo &DCI,
   2691                                                unsigned &RefinementSteps) const {
   2692   SelectionDAG &DAG = DCI.DAG;
   2693   EVT VT = Operand.getValueType();
   2694 
   2695   if (VT == MVT::f32) {
   2696     // Reciprocal, < 1 ulp error.
   2697     //
   2698     // This reciprocal approximation converges to < 0.5 ulp error with one
   2699     // newton rhapson performed with two fused multiple adds (FMAs).
   2700 
   2701     RefinementSteps = 0;
   2702     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
   2703   }
   2704 
   2705   // TODO: There is also f64 rcp instruction, but the documentation is less
   2706   // clear on its precision.
   2707 
   2708   return SDValue();
   2709 }
   2710 
   2711 static void computeKnownBitsForMinMax(const SDValue Op0,
   2712                                       const SDValue Op1,
   2713                                       APInt &KnownZero,
   2714                                       APInt &KnownOne,
   2715                                       const SelectionDAG &DAG,
   2716                                       unsigned Depth) {
   2717   APInt Op0Zero, Op0One;
   2718   APInt Op1Zero, Op1One;
   2719   DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
   2720   DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
   2721 
   2722   KnownZero = Op0Zero & Op1Zero;
   2723   KnownOne = Op0One & Op1One;
   2724 }
   2725 
   2726 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
   2727   const SDValue Op,
   2728   APInt &KnownZero,
   2729   APInt &KnownOne,
   2730   const SelectionDAG &DAG,
   2731   unsigned Depth) const {
   2732 
   2733   KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
   2734 
   2735   APInt KnownZero2;
   2736   APInt KnownOne2;
   2737   unsigned Opc = Op.getOpcode();
   2738 
   2739   switch (Opc) {
   2740   default:
   2741     break;
   2742   case ISD::INTRINSIC_WO_CHAIN: {
   2743     // FIXME: The intrinsic should just use the node.
   2744     switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
   2745     case AMDGPUIntrinsic::AMDGPU_imax:
   2746     case AMDGPUIntrinsic::AMDGPU_umax:
   2747     case AMDGPUIntrinsic::AMDGPU_imin:
   2748     case AMDGPUIntrinsic::AMDGPU_umin:
   2749       computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
   2750                                 KnownZero, KnownOne, DAG, Depth);
   2751       break;
   2752     default:
   2753       break;
   2754     }
   2755 
   2756     break;
   2757   }
   2758   case AMDGPUISD::SMAX:
   2759   case AMDGPUISD::UMAX:
   2760   case AMDGPUISD::SMIN:
   2761   case AMDGPUISD::UMIN:
   2762     computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
   2763                               KnownZero, KnownOne, DAG, Depth);
   2764     break;
   2765 
   2766   case AMDGPUISD::BFE_I32:
   2767   case AMDGPUISD::BFE_U32: {
   2768     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   2769     if (!CWidth)
   2770       return;
   2771 
   2772     unsigned BitWidth = 32;
   2773     uint32_t Width = CWidth->getZExtValue() & 0x1f;
   2774 
   2775     if (Opc == AMDGPUISD::BFE_U32)
   2776       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
   2777 
   2778     break;
   2779   }
   2780   }
   2781 }
   2782 
   2783 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   2784   SDValue Op,
   2785   const SelectionDAG &DAG,
   2786   unsigned Depth) const {
   2787   switch (Op.getOpcode()) {
   2788   case AMDGPUISD::BFE_I32: {
   2789     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   2790     if (!Width)
   2791       return 1;
   2792 
   2793     unsigned SignBits = 32 - Width->getZExtValue() + 1;
   2794     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   2795     if (!Offset || !Offset->isNullValue())
   2796       return SignBits;
   2797 
   2798     // TODO: Could probably figure something out with non-0 offsets.
   2799     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
   2800     return std::max(SignBits, Op0SignBits);
   2801   }
   2802 
   2803   case AMDGPUISD::BFE_U32: {
   2804     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   2805     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
   2806   }
   2807 
   2808   default:
   2809     return 1;
   2810   }
   2811 }
   2812