//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"

using namespace llvm;

namespace {

/// Diagnostic information for unimplemented or unsupported feature reporting.
class DiagnosticInfoUnsupported : public DiagnosticInfo {
private:
  const Twine &Description;
  const Function &Fn;

  static int KindID;

  static int getKindID() {
    if (KindID == 0)
      KindID = llvm::getNextAvailablePluginDiagnosticKind();
    return KindID;
  }

public:
  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
                            DiagnosticSeverity Severity = DS_Error)
    : DiagnosticInfo(getKindID(), Severity),
      Description(Desc),
      Fn(Fn) { }

  const Function &getFunction() const { return Fn; }
  const Twine &getDescription() const { return Description; }

  void print(DiagnosticPrinter &DP) const override {
    DP << "unsupported " << getDescription() << " in " << Fn.getName();
  }

  static bool classof(const DiagnosticInfo *DI) {
    return DI->getKind() == getKindID();
  }
};

int DiagnosticInfoUnsupported::KindID = 0;
}


static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                          CCValAssign::LocInfo LocInfo,
                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
                                        ArgFlags.getOrigAlign());
  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));

  return true;
}

#include "AMDGPUGenCallingConv.inc"
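
// Note: the TableGen-generated calling conventions in this file (e.g.
// CC_AMDGPU, used by AnalyzeFormalArguments below) reference the
// allocateStack helper above by name.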

// Find a larger type to do a load / store of a vector with.
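// For example, a v4i16 value (64 bits in memory) is handled as v2i32, while
// an i16 value is simply handled as i16.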
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

// Register type that a vector of type VT will be loaded into.
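// Sub-dword types are widened to a full i32; e.g. a v2i8 load produces an
// i32, and a v4i16 load produces a v2i32.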
EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, 32);

  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  TargetLowering(TM, new TargetLoweringObjectFileELF()) {

  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();

  setOperationAction(ISD::Constant, MVT::i32, Legal);
  setOperationAction(ISD::Constant, MVT::i64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Library functions.  These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
  setOperationAction(ISD::FABS,   MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
  setOperationAction(ISD::FROUND, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);

  // Custom lowering of vector stores is required for local address space
  // stores.
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  // XXX: Native v2i32 local address space stores are possible, but not
  // currently implemented.
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);

  // XXX: This can be changed to Custom, once ExpandVectorStores can
  // handle 64-bit stores.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);

  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);


  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);

  setOperationAction(ISD::BR_CC, MVT::i1, Expand);

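  // Sea Islands and newer have native instructions for these f64 rounding
  // operations; older generations use the custom expansions below.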
  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);

    // The GPU does not have a divrem instruction for signed or unsigned
    // division.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // The GPU does not have [S|U]MUL_LOHI as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction(ISD::ADD,  VT, Expand);
    setOperationAction(ISD::AND,  VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL,  VT, Expand);
    setOperationAction(ISD::OR,   VT, Expand);
    setOperationAction(ISD::SHL,  VT, Expand);
    setOperationAction(ISD::SRA,  VT, Expand);
    setOperationAction(ISD::SRL,  VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::SUB,  VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    // TODO: Implement custom UREM / SREM routines.
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::XOR,  VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v4f32
  };

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SELECT_CC);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  setSelectIsExpensive(false);
  PredictableSelectIsExpensive = false;

  // There are no integer divide instructions, and these expand to a pretty
  // large sequence of instructions.
  setIntDivIsCheap(false);
  setPow2DivIsCheap(false);

  // TODO: Investigate this when 64-bit divides are implemented.
  addBypassSlowDiv(64, 32);

  // FIXME: Need to really handle these.
  MaxStoresPerMemcpy  = 4096;
  MaxStoresPerMemmove = 4096;
  MaxStoresPerMemset  = 4096;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

MVT AMDGPUTargetLowering::getVectorIdxTy() const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32- and 64-bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {
  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
    return true;

  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();

  return ((LScalarSize <= CastScalarSize) ||
          (CastScalarSize >= 32) ||
          (LScalarSize < 32));
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.
  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
         (Dest->getPrimitiveSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  const DataLayout *DL = getDataLayout();
  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 needed to materialize the high
  // half is free. As used, this will enable reducing 64-bit operations to
  // 32-bit ones, which is always good.
  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {

  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}

SDValue AMDGPUTargetLowering::LowerReturn(
                                     SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     SDLoc DL, SelectionDAG &DAG) const {
  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = *DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
  DAG.getContext()->diagnose(NoCalls);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op.getNode()->dump();
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SDIV: return LowerSDIV(Op, DAG);
  case ISD::SREM: return LowerSREM(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    if (!Node)
      return;

    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM does not seem to replace the chain value inside the
    // CustomWidenLowerNode function.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
    if (Lowered.getNode())
      Results.push_back(Lowered);
    return;
  }
  default:
    return;
  }
}

// FIXME: This implements accesses to initialized globals in the constant
// address space by copying them to private and accessing that. It does not
// properly handle illegal types or vectors. The private vector loads are not
// scalarized, and the illegal scalars hit an assertion. This technique will not
// work well with large initializers, and this should eventually be
// removed. Initialized globals should be placed into a data section that the
// runtime will load into a buffer before the kernel is executed. Uses of the
// global need to be replaced with a pointer loaded from an implicit kernel
// argument into this buffer holding the copy of the data, which will remove the
// need for any of this.
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                       const GlobalValue *GV,
                                                       const SDValue &InitPtr,
                                                       SDValue Chain,
                                                       SelectionDAG &DAG) const {
  const DataLayout *TD = getTargetMachine().getDataLayout();
  SDLoc DL(InitPtr);
  Type *InitTy = Init->getType();

  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                        TD->getPrefTypeAlignment(InitTy));
  }

  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
    EVT VT = EVT::getEVT(CFP->getType());
    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                 TD->getPrefTypeAlignment(CFP->getType()));
  }

  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
    const StructLayout *SL = TD->getStructLayout(ST);

    EVT PtrVT = InitPtr.getValueType();
    SmallVector<SDValue, 8> Chains;

    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(I);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
    EVT PtrVT = InitPtr.getValueType();

    unsigned NumElements;
    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
      NumElements = AT->getNumElements();
    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
      NumElements = VT->getNumElements();
    else
      llvm_unreachable("Unexpected type");

    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
    SmallVector<SDValue, 8> Chains;
    for (unsigned i = 0; i < NumElements; ++i) {
      SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(i);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (isa<UndefValue>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                        TD->getPrefTypeAlignment(InitTy));
  }

  Init->dump();
  llvm_unreachable("Unhandled constant initializer");
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout *TD = getTargetMachine().getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  switch (G->getAddressSpace()) {
  default: llvm_unreachable("Global Address lowering not implemented for this "
                            "address space");
  case AMDGPUAS::LOCAL_ADDRESS: {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    unsigned Offset;
    if (MFI->LocalMemoryObjects.count(GV) == 0) {
      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
      Offset = MFI->LDSSize;
      MFI->LocalMemoryObjects[GV] = Offset;
      // XXX: Account for alignment?
      MFI->LDSSize += Size;
    } else {
      Offset = MFI->LocalMemoryObjects[GV];
    }

    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
    Type *EltType = GV->getType()->getElementType();
    unsigned Size = TD->getTypeAllocSize(EltType);
    unsigned Alignment = TD->getPrefTypeAlignment(EltType);

    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);

    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
    SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);

    const GlobalVariable *Var = cast<GlobalVariable>(GV);
    if (!Var->hasInitializer()) {
      // This has no use, but bugpoint will hit it.
      return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
    }

    const Constant *Init = Var->getInitializer();
    SmallVector<SDNode*, 8> WorkList;

    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
        continue;
      WorkList.push_back(*I);
    }
    SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
                                           E = WorkList.end(); I != E; ++I) {
      SmallVector<SDValue, 8> Ops;
      Ops.push_back(Chain);
      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
        Ops.push_back((*I)->getOperand(i));
      }
      DAG.UpdateNodeOperands(*I, Ops);
    }
    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
  }
  }
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  DAG.ExtractVectorElements(A, Args);
  DAG.ExtractVectorElements(B, Args);

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}

SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
                                              SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
                         Op.getValueType());
}

SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    SelectionDAG &DAG) const {
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  switch (IntrinsicID) {
    default: return Op;
    case AMDGPUIntrinsic::AMDGPU_abs:
    case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
      return LowerIntrinsicIABS(Op, DAG);
    case AMDGPUIntrinsic::AMDGPU_lrp:
      return LowerIntrinsicLRP(Op, DAG);
    case AMDGPUIntrinsic::AMDGPU_fract:
    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_clamp:
    case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_div_scale: {
      // 3rd parameter required to be a constant.
      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
      if (!Param)
        return DAG.getUNDEF(VT);

      // Translate to the operands expected by the machine instruction. The
      // first operand must match either the numerator or the denominator,
      // depending on the constant parameter.
      SDValue Numerator = Op.getOperand(1);
      SDValue Denominator = Op.getOperand(2);
      SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
                         Src0, Denominator, Numerator);
    }

    case Intrinsic::AMDGPU_div_fmas:
      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_div_fixup:
      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_trig_preop:
      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case Intrinsic::AMDGPU_rcp:
      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq:
      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq_clamped:
      return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_imax:
      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umax:
      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_imin:
      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umin:
      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_umul24:
      return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_imul24:
      return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_umad24:
      return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_imad24:
      return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfi:
      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfm:
      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_brev:
      return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
    case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
      return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
  }
}

/// IABS(a) = SMAX(sub(0, a), a)
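/// e.g. IABS(-5) = SMAX(5, -5) = 5.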
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                              Op.getOperand(1));

  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
}

/// Linear Interpolation
/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
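/// e.g. LRP(0.25, 8.0, 4.0) = 0.25 * 8.0 + 0.75 * 4.0 = 5.0.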
SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
                                DAG.getConstantFP(1.0f, MVT::f32),
                                Op.getOperand(1));
  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
                                                    Op.getOperand(3));
  return DAG.getNode(ISD::FADD, DL, VT,
      DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
      OneSubAC);
}

/// \brief Generate Min/Max node
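/// e.g. (select_cc a, b, a, b, setolt) becomes (fmin a, b).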
SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
                                            SelectionDAG &DAG) const {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue True = N->getOperand(2);
  SDValue False = N->getOperand(3);
  SDValue CC = N->getOperand(4);

  if (VT != MVT::f32 ||
      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
    return SDValue();
  }

  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    llvm_unreachable("Operation should already be optimised!");
  case ISD::SETULE:
  case ISD::SETULT:
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
    return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGE:
  case ISD::SETOGE:
  case ISD::SETUGT:
  case ISD::SETOGT: {
    unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
    return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

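// Split a vector load into one scalar load per element. The element values
// are recombined with BUILD_VECTOR and the chains merged with a TokenFactor.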
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
  EVT LoadVT = Op.getValueType();
  EVT EltVT = Op.getValueType().getVectorElementType();
  EVT PtrVT = Load->getBasePtr().getValueType();

  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
  SmallVector<SDValue, 8> Loads;
  SmallVector<SDValue, 8> Chains;

  SDLoc SL(Op);

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));

    SDValue NewLoad
      = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
                       Load->getChain(), Ptr,
                       MachinePointerInfo(Load->getMemOperand()->getValue()),
                       MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
                       Load->getAlignment());
    Loads.push_back(NewLoad.getValue(0));
    Chains.push_back(NewLoad.getValue(1));
  }

  SDValue Ops[] = {
    DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
    DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
  };

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a 32-bit
  // vector truncating store into an i32 store.
  // XXX: We could also optimize other vector bitwidths.
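  // For example, a <2 x i8> truncating store becomes a single i16 truncating
  // store of the two bytes packed into the low bits of an i32.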
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  SDValue Value = Store->getValue();
  EVT VT = Value.getValueType();
  EVT ElemVT = VT.getVectorElementType();
  SDValue Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  unsigned PackedSize = MemVT.getStoreSizeInBits();
  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);

  assert(Value.getValueType().getScalarSizeInBits() >= 32);

  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg

    SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);

    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
    }
  }

  if (PackedSize < 32) {
    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
                             Store->getMemOperand()->getPointerInfo(),
                             PackedVT,
                             Store->isNonTemporal(), Store->isVolatile(),
                             Store->getAlignment());
  }

  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getMemOperand()->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}

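// Split a vector store into one truncating scalar store per element.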
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
  EVT PtrVT = Store->getBasePtr().getValueType();
  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
  SDLoc SL(Op);

  SmallVector<SDValue, 8> Chains;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Store->getValue(), DAG.getConstant(i, MVT::i32));
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
                              Store->getBasePtr(),
                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
                                            PtrVT));
    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
                         MachinePointerInfo(Store->getMemOperand()->getValue()),
                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
                         Store->getAlignment()));
  }
  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
}

SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT VT = Op.getValueType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
    // We can do the extload to 32-bits, and then need to separately extend to
    // 64-bits.

    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
                                       Load->getChain(),
                                       Load->getBasePtr(),
                                       MemVT,
                                       Load->getMemOperand());

    SDValue Ops[] = {
      DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
      ExtLoad32.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, MVT::i8, MMO);

    SDValue Ops[] = {
      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  // Lower loads of global variables in the constant address space.
  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(
          GetUnderlyingObject(Load->getMemOperand()->getValue()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       Load->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
    return SDValue();

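  // Private-address loads go through 32-bit registers, so a sub-dword extload
  // is emulated by loading the containing dword and shifting the addressed
  // byte or halfword into the low bits before extending it.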
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, MVT::i32));
  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                            Load->getChain(), Ptr,
                            DAG.getTargetConstant(0, MVT::i32),
                            Op.getOperand(2));
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                Load->getBasePtr(),
                                DAG.getConstant(0x3, MVT::i32));
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, MVT::i32));

  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);

  EVT MemEltVT = MemVT.getScalarType();
  if (ExtType == ISD::SEXTLOAD) {
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);

    SDValue Ops[] = {
      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
      Load->getChain()
    };

    return DAG.getMergeValues(Ops, DL);
  }

  SDValue Ops[] = {
    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
    Load->getChain()
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Chain = Store->getChain();
  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
      Store->getValue().getValueType().isVector()) {
    return SplitVectorStore(Op, DAG);
  }

  EVT MemVT = Store->getMemoryVT();
  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
      MemVT.bitsLT(MVT::i32)) {
   1240     unsigned Mask = 0;
   1241     if (Store->getMemoryVT() == MVT::i8) {
   1242       Mask = 0xff;
   1243     } else if (Store->getMemoryVT() == MVT::i16) {
   1244       Mask = 0xffff;
   1245     }
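             // Sub-dword private stores are emulated as a read-modify-write of
             // the containing 32-bit register: load the dword, clear the target
             // lane with DstMask = ~(Mask << ShiftAmt), then OR in the value
             // shifted into place. E.g. storing an i8 value V at byte address 5
             // rewrites dword 1 as (Dst & ~0xff00) | ((V & 0xff) << 8).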
   1246     SDValue BasePtr = Store->getBasePtr();
   1247     SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
   1248                               DAG.getConstant(2, MVT::i32));
   1249     SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
   1250                               Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
   1251 
   1252     SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
   1253                                   DAG.getConstant(0x3, MVT::i32));
   1254 
   1255     SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
   1256                                    DAG.getConstant(3, MVT::i32));
   1257 
   1258     SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
   1259                                     Store->getValue());
   1260 
   1261     SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
   1262 
   1263     SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
   1264                                        MaskedValue, ShiftAmt);
   1265 
   1266     SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
   1267                                   ShiftAmt);
   1268     DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
   1269                           DAG.getConstant(0xffffffff, MVT::i32));
   1270     Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
   1271 
   1272     SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
   1273     return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
   1274                        Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
   1275   }
   1276   return SDValue();
   1277 }
   1278 
   1279 SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
   1280   SDLoc DL(Op);
   1281   EVT OVT = Op.getValueType();
   1282   SDValue LHS = Op.getOperand(0);
   1283   SDValue RHS = Op.getOperand(1);
   1284   MVT INTTY;
   1285   MVT FLTTY;
   1286   if (!OVT.isVector()) {
   1287     INTTY = MVT::i32;
   1288     FLTTY = MVT::f32;
   1289   } else if (OVT.getVectorNumElements() == 2) {
   1290     INTTY = MVT::v2i32;
   1291     FLTTY = MVT::v2f32;
   1292   } else if (OVT.getVectorNumElements() == 4) {
   1293     INTTY = MVT::v4i32;
   1294     FLTTY = MVT::v4f32;
   1295   }
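           // The division itself is done in f32: fq = trunc(fa * rcp(fb)) is an
           // approximate quotient and fr = fqneg * fb + fa is the remainder of
           // that guess; when |fr| >= |fb| the guess is off by one. jq, computed
           // below as ((ia ^ ib) >> (bitsize - 2)) | 1, is +1 when the operand
           // signs match and -1 when they differ, so adding the selected jq
           // applies the correction with the right sign.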
   1296   unsigned bitsize = OVT.getScalarType().getSizeInBits();
   1297   // char|short jq = ia ^ ib;
   1298   SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
   1299 
   1300   // jq = jq >> (bitsize - 2)
   1301   jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
   1302 
   1303   // jq = jq | 0x1
   1304   jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
   1305 
   1306   // jq = (int)jq
   1307   jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
   1308 
   1309   // int ia = (int)LHS;
   1310   SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
   1311 
    1312   // int ib = (int)RHS;
   1313   SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
   1314 
   1315   // float fa = (float)ia;
   1316   SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
   1317 
   1318   // float fb = (float)ib;
   1319   SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
   1320 
   1321   // float fq = native_divide(fa, fb);
   1322   SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
   1323                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));
   1324 
   1325   // fq = trunc(fq);
   1326   fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
   1327 
   1328   // float fqneg = -fq;
   1329   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
   1330 
   1331   // float fr = mad(fqneg, fb, fa);
   1332   SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
    1333       DAG.getNode(ISD::FMUL, DL, FLTTY, fqneg, fb), fa);
   1334 
   1335   // int iq = (int)fq;
   1336   SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
   1337 
   1338   // fr = fabs(fr);
   1339   fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
   1340 
   1341   // fb = fabs(fb);
   1342   fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
   1343 
   1344   // int cv = fr >= fb;
    1345   SDValue cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
   1351   // jq = (cv ? jq : 0);
   1352   jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
   1353       DAG.getConstant(0, OVT));
   1354   // dst = iq + jq;
   1355   iq = DAG.getSExtOrTrunc(iq, DL, OVT);
   1356   iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
   1357   return iq;
   1358 }
   1359 
   1360 SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
   1361   SDLoc DL(Op);
   1362   EVT OVT = Op.getValueType();
   1363   SDValue LHS = Op.getOperand(0);
   1364   SDValue RHS = Op.getOperand(1);
    1365   // The LowerSDIV32 function generates code equivalent to the following IL.
   1366   // mov r0, LHS
   1367   // mov r1, RHS
   1368   // ilt r10, r0, 0
   1369   // ilt r11, r1, 0
   1370   // iadd r0, r0, r10
   1371   // iadd r1, r1, r11
   1372   // ixor r0, r0, r10
   1373   // ixor r1, r1, r11
   1374   // udiv r0, r0, r1
   1375   // ixor r10, r10, r11
   1376   // iadd r0, r0, r10
   1377   // ixor DST, r0, r10
   1378 
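           // The select/add/xor sequence below implements |x| = (x + s) ^ s for
           // the sign mask s = (x < 0 ? -1 : 0), and re-applies the quotient's
           // sign (s0 ^ s1) the same way at the end. Worked example, -7 / 2:
           //   s0 = -1, s1 = 0; |LHS| = (-7 + -1) ^ -1 = 7; 7 udiv 2 = 3;
           //   sign = s0 ^ s1 = -1; DST = (3 + -1) ^ -1 = -3, matching C's
           //   truncating division.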
   1379   // mov r0, LHS
   1380   SDValue r0 = LHS;
   1381 
   1382   // mov r1, RHS
   1383   SDValue r1 = RHS;
   1384 
   1385   // ilt r10, r0, 0
   1386   SDValue r10 = DAG.getSelectCC(DL,
   1387       r0, DAG.getConstant(0, OVT),
   1388       DAG.getConstant(-1, OVT),
   1389       DAG.getConstant(0, OVT),
   1390       ISD::SETLT);
   1391 
   1392   // ilt r11, r1, 0
   1393   SDValue r11 = DAG.getSelectCC(DL,
   1394       r1, DAG.getConstant(0, OVT),
   1395       DAG.getConstant(-1, OVT),
   1396       DAG.getConstant(0, OVT),
   1397       ISD::SETLT);
   1398 
   1399   // iadd r0, r0, r10
   1400   r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
   1401 
   1402   // iadd r1, r1, r11
   1403   r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
   1404 
   1405   // ixor r0, r0, r10
   1406   r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
   1407 
   1408   // ixor r1, r1, r11
   1409   r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
   1410 
   1411   // udiv r0, r0, r1
   1412   r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
   1413 
   1414   // ixor r10, r10, r11
   1415   r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
   1416 
   1417   // iadd r0, r0, r10
   1418   r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
   1419 
   1420   // ixor DST, r0, r10
   1421   SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
   1422   return DST;
   1423 }
   1424 
   1425 SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
   1426   return SDValue(Op.getNode(), 0);
   1427 }
   1428 
   1429 SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
   1430   EVT OVT = Op.getValueType().getScalarType();
   1431 
   1432   if (OVT == MVT::i64)
   1433     return LowerSDIV64(Op, DAG);
   1434 
    1435   if (OVT == MVT::i32)
   1436     return LowerSDIV32(Op, DAG);
   1437 
   1438   if (OVT == MVT::i16 || OVT == MVT::i8) {
   1439     // FIXME: We should be checking for the masked bits. This isn't reached
   1440     // because i8 and i16 are not legal types.
   1441     return LowerSDIV24(Op, DAG);
   1442   }
   1443 
   1444   return SDValue(Op.getNode(), 0);
   1445 }
   1446 
   1447 SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
   1448   SDLoc DL(Op);
   1449   EVT OVT = Op.getValueType();
   1450   SDValue LHS = Op.getOperand(0);
   1451   SDValue RHS = Op.getOperand(1);
    1452   // The LowerSREM32 function generates code equivalent to the following IL.
   1453   // mov r0, LHS
   1454   // mov r1, RHS
   1455   // ilt r10, r0, 0
   1456   // ilt r11, r1, 0
   1457   // iadd r0, r0, r10
   1458   // iadd r1, r1, r11
   1459   // ixor r0, r0, r10
   1460   // ixor r1, r1, r11
   1461   // udiv r20, r0, r1
   1462   // umul r20, r20, r1
   1463   // sub r0, r0, r20
   1464   // iadd r0, r0, r10
   1465   // ixor DST, r0, r10
   1466 
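           // Same absolute-value trick as LowerSDIV32, but the remainder takes
           // the sign of LHS only (r10). Worked example, -7 % 2:
           //   |LHS| = 7, |RHS| = 2; 7 - (7 udiv 2) * 2 = 1;
           //   result = (1 + -1) ^ -1 = -1, matching C semantics.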
   1467   // mov r0, LHS
   1468   SDValue r0 = LHS;
   1469 
   1470   // mov r1, RHS
   1471   SDValue r1 = RHS;
   1472 
   1473   // ilt r10, r0, 0
   1474   SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
   1475 
   1476   // ilt r11, r1, 0
   1477   SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
   1478 
   1479   // iadd r0, r0, r10
   1480   r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
   1481 
   1482   // iadd r1, r1, r11
   1483   r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
   1484 
   1485   // ixor r0, r0, r10
   1486   r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
   1487 
   1488   // ixor r1, r1, r11
   1489   r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
   1490 
   1491   // udiv r20, r0, r1
    1492   SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
   1493 
   1494   // umul r20, r20, r1
   1495   r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
   1496 
   1497   // sub r0, r0, r20
   1498   r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
   1499 
   1500   // iadd r0, r0, r10
   1501   r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
   1502 
   1503   // ixor DST, r0, r10
   1504   SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
   1505   return DST;
   1506 }
   1507 
   1508 SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
   1509   return SDValue(Op.getNode(), 0);
   1510 }
   1511 
   1512 SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
   1513   EVT OVT = Op.getValueType();
   1514 
   1515   if (OVT.getScalarType() == MVT::i64)
   1516     return LowerSREM64(Op, DAG);
   1517 
   1518   if (OVT.getScalarType() == MVT::i32)
   1519     return LowerSREM32(Op, DAG);
   1520 
   1521   return SDValue(Op.getNode(), 0);
   1522 }
   1523 
   1524 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   1525                                            SelectionDAG &DAG) const {
   1526   SDLoc DL(Op);
   1527   EVT VT = Op.getValueType();
   1528 
   1529   SDValue Num = Op.getOperand(0);
   1530   SDValue Den = Op.getOperand(1);
   1531 
    1532   // RCP = URECIP(Den) = 2^32 / Den + e,
    1533   // where e is the rounding error.
   1534   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
   1535 
    1536   // RCP_LO = umulo(RCP, Den)
   1537   SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
   1538 
    1539   // RCP_HI = mulhu(RCP, Den)
   1540   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
   1541 
   1542   // NEG_RCP_LO = -RCP_LO
   1543   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
   1544                                                      RCP_LO);
   1545 
   1546   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
   1547   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
   1548                                            NEG_RCP_LO, RCP_LO,
   1549                                            ISD::SETEQ);
   1550   // Calculate the rounding error from the URECIP instruction
   1551   // E = mulhu(ABS_RCP_LO, RCP)
   1552   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
   1553 
   1554   // RCP_A_E = RCP + E
   1555   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
   1556 
   1557   // RCP_S_E = RCP - E
   1558   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
   1559 
    1560   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
   1561   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
   1562                                      RCP_A_E, RCP_S_E,
   1563                                      ISD::SETEQ);
   1564   // Quotient = mulhu(Tmp0, Num)
   1565   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
   1566 
   1567   // Num_S_Remainder = Quotient * Den
   1568   SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
   1569 
   1570   // Remainder = Num - Num_S_Remainder
   1571   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
   1572 
   1573   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
   1574   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
   1575                                                  DAG.getConstant(-1, VT),
   1576                                                  DAG.getConstant(0, VT),
   1577                                                  ISD::SETUGE);
   1578   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
   1579   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
   1580                                                   Num_S_Remainder,
   1581                                                   DAG.getConstant(-1, VT),
   1582                                                   DAG.getConstant(0, VT),
   1583                                                   ISD::SETUGE);
   1584   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
   1585   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
   1586                                                Remainder_GE_Zero);
   1587 
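           // URECIP is only approximate, so Quotient may be off by one in
           // either direction. Remainder_GE_Den means the quotient was too
           // small; Remainder_GE_Zero being false means Num < Quotient * Den,
           // i.e. the quotient was too large. The selects below pick between
           // Quotient - 1, Quotient, and Quotient + 1 accordingly, with
           // matching fixups for the remainder.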
    1588   // Calculate the division result:
   1589 
   1590   // Quotient_A_One = Quotient + 1
   1591   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
   1592                                                          DAG.getConstant(1, VT));
   1593 
   1594   // Quotient_S_One = Quotient - 1
   1595   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
   1596                                                          DAG.getConstant(1, VT));
   1597 
   1598   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
   1599   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
   1600                                      Quotient, Quotient_A_One, ISD::SETEQ);
   1601 
   1602   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
   1603   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
   1604                             Quotient_S_One, Div, ISD::SETEQ);
   1605 
    1606   // Calculate the remainder result:
   1607 
   1608   // Remainder_S_Den = Remainder - Den
   1609   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
   1610 
   1611   // Remainder_A_Den = Remainder + Den
   1612   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
   1613 
   1614   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
   1615   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
   1616                                     Remainder, Remainder_S_Den, ISD::SETEQ);
   1617 
   1618   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
   1619   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
   1620                             Remainder_A_Den, Rem, ISD::SETEQ);
   1621   SDValue Ops[2] = {
   1622     Div,
   1623     Rem
   1624   };
   1625   return DAG.getMergeValues(Ops, DL);
   1626 }
   1627 
   1628 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
   1629                                            SelectionDAG &DAG) const {
   1630   SDLoc DL(Op);
   1631   EVT VT = Op.getValueType();
   1632 
   1633   SDValue Zero = DAG.getConstant(0, VT);
   1634   SDValue NegOne = DAG.getConstant(-1, VT);
   1635 
   1636   SDValue LHS = Op.getOperand(0);
   1637   SDValue RHS = Op.getOperand(1);
   1638 
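           // Signed divrem is expanded as one unsigned UDIVREM on absolute
           // values plus sign fixups: the quotient's sign is
           // sign(LHS) ^ sign(RHS) and the remainder's sign matches LHS, as in
           // C. Absolute value uses |x| = (x + s) ^ s and re-negation uses the
           // equivalent identity -x = (x ^ s) - s, both for s = -1.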
   1639   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
   1640   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
   1641   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
   1642   SDValue RSign = LHSign; // Remainder sign is the same as LHS
   1643 
   1644   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
   1645   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
   1646 
   1647   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
   1648   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
   1649 
   1650   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
   1651   SDValue Rem = Div.getValue(1);
   1652 
   1653   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
   1654   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
   1655 
   1656   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
   1657   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
   1658 
   1659   SDValue Res[2] = {
   1660     Div,
   1661     Rem
   1662   };
   1663   return DAG.getMergeValues(Res, DL);
   1664 }
   1665 
   1666 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   1667   SDLoc SL(Op);
   1668   SDValue Src = Op.getOperand(0);
   1669 
   1670   // result = trunc(src)
   1671   // if (src > 0.0 && src != result)
   1672   //   result += 1.0
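           // E.g. ceil(2.5): trunc = 2.0, src > 0 and src != trunc, so the
           // result is 3.0. For ceil(-2.5): trunc = -2.0 and the condition
           // fails, so -2.0 is already correct.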
   1673 
   1674   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
   1675 
   1676   const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
   1677   const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
   1678 
   1679   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
   1680 
    1681   SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
    1682   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    1683   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
   1684 
   1685   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
   1686   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
   1687 }
   1688 
   1689 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   1690   SDLoc SL(Op);
   1691   SDValue Src = Op.getOperand(0);
   1692 
   1693   assert(Op.getValueType() == MVT::f64);
   1694 
   1695   const SDValue Zero = DAG.getConstant(0, MVT::i32);
   1696   const SDValue One = DAG.getConstant(1, MVT::i32);
   1697 
   1698   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
   1699 
   1700   // Extract the upper half, since this is where we will find the sign and
   1701   // exponent.
   1702   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
   1703 
   1704   const unsigned FractBits = 52;
   1705   const unsigned ExpBits = 11;
   1706 
   1707   // Extract the exponent.
   1708   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
   1709                                 Hi,
   1710                                 DAG.getConstant(FractBits - 32, MVT::i32),
   1711                                 DAG.getConstant(ExpBits, MVT::i32));
   1712   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
   1713                             DAG.getConstant(1023, MVT::i32));
   1714 
   1715   // Extract the sign bit.
   1716   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
   1717   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
   1718 
    1719   // Extend back to 64 bits.
   1720   SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
   1721                                   Zero, SignBit);
   1722   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
   1723 
   1724   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
   1725   const SDValue FractMask
   1726     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
   1727 
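           // For a double with unbiased exponent Exp, the low (52 - Exp) bits
           // of the mantissa are fractional, so FractMask >> Exp selects
           // exactly those bits and Tmp0 clears them. Exp < 0 means |Src| < 1.0
           // and the result is just the signed zero; Exp > 51 means Src is
           // already an integer and is passed through unchanged.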
   1728   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
   1729   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
   1730   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
   1731 
   1732   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
   1733 
   1734   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
   1735 
   1736   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
   1737   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
   1738 
   1739   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
   1740   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
   1741 
   1742   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
   1743 }
   1744 
   1745 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
   1746   SDLoc SL(Op);
   1747   SDValue Src = Op.getOperand(0);
   1748 
   1749   assert(Op.getValueType() == MVT::f64);
   1750 
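           // This rounds via the classic 2^52 trick: a double with magnitude
           // >= 2^52 has no bits below the binary point, so the FADD rounds Src
           // to an integer in the current rounding mode and the FSUB recovers
           // it. copysign keeps the trick working for negative inputs, and
           // values with |Src| > 0x1.fffffffffffffp+51 are returned unchanged
           // since they can hold no fractional part.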
   1751   APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
   1752   SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
   1753   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
   1754 
   1755   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
   1756   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
   1757 
   1758   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
   1759 
   1760   APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
   1761   SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
   1762 
   1763   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
   1764   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
   1765 
   1766   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
   1767 }
   1768 
   1769 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
   1770   // FNEARBYINT and FRINT are the same, except in their handling of FP
   1771   // exceptions. Those aren't really meaningful for us, and OpenCL only has
   1772   // rint, so just treat them as equivalent.
   1773   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
   1774 }
   1775 
   1776 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   1777   SDLoc SL(Op);
   1778   SDValue Src = Op.getOperand(0);
   1779 
   1780   // result = trunc(src);
   1781   // if (src < 0.0 && src != result)
   1782   //   result += -1.0.
   1783 
   1784   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
   1785 
   1786   const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
   1787   const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
   1788 
   1789   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
   1790 
   1791   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
   1792   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
   1793   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
   1794 
   1795   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
   1796   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
   1797 }
   1798 
   1799 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
   1800                                                SelectionDAG &DAG) const {
   1801   SDValue S0 = Op.getOperand(0);
   1802   SDLoc DL(Op);
   1803   if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
   1804     return SDValue();
   1805 
   1806   // f32 uint_to_fp i64
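           // Split the i64 into 32-bit halves and combine them as floats:
           //   (float)lo + (float)hi * 2^32.
           // 2^32 = 4294967296.0f is exactly representable; e.g. 0x100000000
           // converts to (float)0 + (float)1 * 2^32 = 4294967296.0f.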
   1807   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
   1808                            DAG.getConstant(0, MVT::i32));
   1809   SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
   1810   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
   1811                            DAG.getConstant(1, MVT::i32));
   1812   SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
   1813   FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
   1814                         DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
   1815   return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
   1816 }
   1817 
   1818 SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
   1819                                                       unsigned BitsDiff,
   1820                                                       SelectionDAG &DAG) const {
   1821   MVT VT = Op.getSimpleValueType();
   1822   SDLoc DL(Op);
   1823   SDValue Shift = DAG.getConstant(BitsDiff, VT);
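           // E.g. sign-extending an i8 lane held in an i32: BitsDiff = 24, and
           // (x << 24) >> 24 (arithmetic) replicates bit 7 across the high bits.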
   1824   // Shift left by 'Shift' bits.
   1825   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
    1826   // Arithmetic shift right by 'Shift' bits.
   1827   return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
   1828 }
   1829 
   1830 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   1831                                                      SelectionDAG &DAG) const {
   1832   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
   1833   MVT VT = Op.getSimpleValueType();
   1834   MVT ScalarVT = VT.getScalarType();
   1835 
   1836   if (!VT.isVector())
   1837     return SDValue();
   1838 
   1839   SDValue Src = Op.getOperand(0);
   1840   SDLoc DL(Op);
   1841 
   1842   // TODO: Don't scalarize on Evergreen?
   1843   unsigned NElts = VT.getVectorNumElements();
   1844   SmallVector<SDValue, 8> Args;
   1845   DAG.ExtractVectorElements(Src, Args, 0, NElts);
   1846 
   1847   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
   1848   for (unsigned I = 0; I < NElts; ++I)
   1849     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
   1850 
   1851   return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
   1852 }
   1853 
   1854 //===----------------------------------------------------------------------===//
   1855 // Custom DAG optimizations
   1856 //===----------------------------------------------------------------------===//
   1857 
   1858 static bool isU24(SDValue Op, SelectionDAG &DAG) {
   1859   APInt KnownZero, KnownOne;
   1860   EVT VT = Op.getValueType();
   1861   DAG.computeKnownBits(Op, KnownZero, KnownOne);
   1862 
   1863   return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
   1864 }
   1865 
   1866 static bool isI24(SDValue Op, SelectionDAG &DAG) {
   1867   EVT VT = Op.getValueType();
   1868 
    1869   // In order for this to be a signed 24-bit value, bit 23 must be a
    1870   // sign bit.
   1871   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
   1872                                      // as unsigned 24-bit values.
   1873          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
   1874 }
   1875 
   1876 static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
   1877 
   1878   SelectionDAG &DAG = DCI.DAG;
   1879   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   1880   EVT VT = Op.getValueType();
   1881 
   1882   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
   1883   APInt KnownZero, KnownOne;
   1884   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
   1885   if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
   1886     DCI.CommitTargetLoweringOpt(TLO);
   1887 }
   1888 
   1889 template <typename IntTy>
   1890 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
   1891                                uint32_t Offset, uint32_t Width) {
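           // Left-align the field, then shift back down so IntTy's shift
           // semantics (arithmetic for int32_t, logical for uint32_t) perform
           // the sign or zero extension. E.g. BFE_I32(0x0000ff00, 8, 8):
           // (0x0000ff00 << 16) >> 24 as int32_t yields -1.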
   1892   if (Width + Offset < 32) {
   1893     IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
   1894     return DAG.getConstant(Result, MVT::i32);
   1895   }
   1896 
   1897   return DAG.getConstant(Src0 >> Offset, MVT::i32);
   1898 }
   1899 
   1900 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   1901                                                 DAGCombinerInfo &DCI) const {
   1902   EVT VT = N->getValueType(0);
   1903 
   1904   if (VT.isVector() || VT.getSizeInBits() > 32)
   1905     return SDValue();
   1906 
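           // The 24-bit paths map to the hardware's 24-bit multiply
           // instructions (hence the hasMulU24/hasMulI24 subtarget checks),
           // which can be cheaper than a full 32-bit multiply.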
   1907   SelectionDAG &DAG = DCI.DAG;
   1908   SDLoc DL(N);
   1909 
   1910   SDValue N0 = N->getOperand(0);
   1911   SDValue N1 = N->getOperand(1);
   1912   SDValue Mul;
   1913 
   1914   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
   1915     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
   1916     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
   1917     Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
   1918   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
   1919     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
   1920     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
   1921     Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
   1922   } else {
   1923     return SDValue();
   1924   }
   1925 
    1926   // We need to use sext even for MUL_U24, because MUL_U24 is also used
    1927   // for signed multiplies of 8- and 16-bit types.
   1928   return DAG.getSExtOrTrunc(Mul, DL, VT);
   1929 }
   1930 
   1931 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   1932                                             DAGCombinerInfo &DCI) const {
   1933   SelectionDAG &DAG = DCI.DAG;
   1934   SDLoc DL(N);
   1935 
    1936   switch (N->getOpcode()) {
    1937   default: break;
    1938   case ISD::MUL:
    1939     return performMulCombine(N, DCI);
    1940   case AMDGPUISD::MUL_I24:
    1941   case AMDGPUISD::MUL_U24: {
    1942     SDValue N0 = N->getOperand(0);
    1943     SDValue N1 = N->getOperand(1);
    1944     simplifyI24(N0, DCI);
    1945     simplifyI24(N1, DCI);
    1946     return SDValue();
    1947   }
    1948   case ISD::SELECT_CC: {
    1949     return CombineMinMax(N, DAG);
    1950   }
   1951   case AMDGPUISD::BFE_I32:
   1952   case AMDGPUISD::BFE_U32: {
   1953     assert(!N->getValueType(0).isVector() &&
   1954            "Vector handling of BFE not implemented");
   1955     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
   1956     if (!Width)
   1957       break;
   1958 
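             // The hardware BFE operations extract a field of WidthVal bits
             // starting at bit OffsetVal and zero-extend (BFE_U32) or
             // sign-extend (BFE_I32) it to 32 bits; offset and width are taken
             // from the low 5 bits of their operands, hence the & 0x1f masking
             // here.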
   1959     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
   1960     if (WidthVal == 0)
   1961       return DAG.getConstant(0, MVT::i32);
   1962 
   1963     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
   1964     if (!Offset)
   1965       break;
   1966 
   1967     SDValue BitsFrom = N->getOperand(0);
   1968     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
   1969 
   1970     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
   1971 
   1972     if (OffsetVal == 0) {
   1973       // This is already sign / zero extended, so try to fold away extra BFEs.
    1974       unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
   1975 
   1976       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
   1977       if (OpSignBits >= SignBits)
   1978         return BitsFrom;
   1979 
   1980       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
   1981       if (Signed) {
   1982         // This is a sign_extend_inreg. Replace it to take advantage of existing
   1983         // DAG Combines. If not eliminated, we will match back to BFE during
   1984         // selection.
   1985 
    1986         // TODO: The sext_inreg of extended types ends up as several
    1987         // operations, although we could handle them in a single BFE.
   1988         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
   1989                            DAG.getValueType(SmallVT));
   1990       }
   1991 
   1992       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
   1993     }
   1994 
   1995     if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
   1996       if (Signed) {
   1997         return constantFoldBFE<int32_t>(DAG,
   1998                                         Val->getSExtValue(),
   1999                                         OffsetVal,
   2000                                         WidthVal);
   2001       }
   2002 
   2003       return constantFoldBFE<uint32_t>(DAG,
   2004                                        Val->getZExtValue(),
   2005                                        OffsetVal,
   2006                                        WidthVal);
   2007     }
   2008 
   2009     APInt Demanded = APInt::getBitsSet(32,
   2010                                        OffsetVal,
   2011                                        OffsetVal + WidthVal);
   2012 
   2013     if ((OffsetVal + WidthVal) >= 32) {
   2014       SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
   2015       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
   2016                          BitsFrom, ShiftVal);
   2017     }
   2018 
   2019     APInt KnownZero, KnownOne;
   2020     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   2021                                           !DCI.isBeforeLegalizeOps());
   2022     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   2023     if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
   2024         TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
   2025       DCI.CommitTargetLoweringOpt(TLO);
   2026     }
   2027 
   2028     break;
   2029   }
   2030   }
   2031   return SDValue();
   2032 }
   2033 
   2034 //===----------------------------------------------------------------------===//
   2035 // Helper functions
   2036 //===----------------------------------------------------------------------===//
   2037 
   2038 void AMDGPUTargetLowering::getOriginalFunctionArgs(
   2039                                SelectionDAG &DAG,
   2040                                const Function *F,
   2041                                const SmallVectorImpl<ISD::InputArg> &Ins,
   2042                                SmallVectorImpl<ISD::InputArg> &OrigIns) const {
   2043 
   2044   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
   2045     if (Ins[i].ArgVT == Ins[i].VT) {
   2046       OrigIns.push_back(Ins[i]);
   2047       continue;
   2048     }
   2049 
   2050     EVT VT;
   2051     if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
   2052       // Vector has been split into scalars.
   2053       VT = Ins[i].ArgVT.getVectorElementType();
   2054     } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
   2055                Ins[i].ArgVT.getVectorElementType() !=
   2056                Ins[i].VT.getVectorElementType()) {
   2057       // Vector elements have been promoted
   2058       VT = Ins[i].ArgVT;
   2059     } else {
    2060       // Vector has been split into smaller vectors.
   2061       VT = Ins[i].VT;
   2062     }
   2063 
   2064     ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
   2065                       Ins[i].OrigArgIndex, Ins[i].PartOffset);
   2066     OrigIns.push_back(Arg);
   2067   }
   2068 }
   2069 
   2070 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
   2071   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
   2072     return CFP->isExactlyValue(1.0);
   2073   }
   2074   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   2075     return C->isAllOnesValue();
   2076   }
   2077   return false;
   2078 }
   2079 
   2080 bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
   2081   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
   2082     return CFP->getValueAPF().isZero();
   2083   }
   2084   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
   2085     return C->isNullValue();
   2086   }
   2087   return false;
   2088 }
   2089 
   2090 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
   2091                                                   const TargetRegisterClass *RC,
   2092                                                    unsigned Reg, EVT VT) const {
   2093   MachineFunction &MF = DAG.getMachineFunction();
   2094   MachineRegisterInfo &MRI = MF.getRegInfo();
   2095   unsigned VirtualRegister;
   2096   if (!MRI.isLiveIn(Reg)) {
   2097     VirtualRegister = MRI.createVirtualRegister(RC);
   2098     MRI.addLiveIn(Reg, VirtualRegister);
   2099   } else {
   2100     VirtualRegister = MRI.getLiveInVirtReg(Reg);
   2101   }
   2102   return DAG.getRegister(VirtualRegister, VT);
   2103 }
   2104 
   2105 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
   2106 
   2107 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   2108   switch (Opcode) {
   2109   default: return nullptr;
   2110   // AMDIL DAG nodes
    2111   NODE_NAME_CASE(CALL)
    2112   NODE_NAME_CASE(UMUL)
    2113   NODE_NAME_CASE(RET_FLAG)
    2114   NODE_NAME_CASE(BRANCH_COND)
   2115 
   2116   // AMDGPU DAG nodes
   2117   NODE_NAME_CASE(DWORDADDR)
   2118   NODE_NAME_CASE(FRACT)
   2119   NODE_NAME_CASE(CLAMP)
   2120   NODE_NAME_CASE(FMAX)
   2121   NODE_NAME_CASE(SMAX)
   2122   NODE_NAME_CASE(UMAX)
   2123   NODE_NAME_CASE(FMIN)
   2124   NODE_NAME_CASE(SMIN)
   2125   NODE_NAME_CASE(UMIN)
   2126   NODE_NAME_CASE(URECIP)
   2127   NODE_NAME_CASE(DIV_SCALE)
   2128   NODE_NAME_CASE(DIV_FMAS)
   2129   NODE_NAME_CASE(DIV_FIXUP)
   2130   NODE_NAME_CASE(TRIG_PREOP)
   2131   NODE_NAME_CASE(RCP)
   2132   NODE_NAME_CASE(RSQ)
   2133   NODE_NAME_CASE(RSQ_LEGACY)
   2134   NODE_NAME_CASE(RSQ_CLAMPED)
   2135   NODE_NAME_CASE(DOT4)
   2136   NODE_NAME_CASE(BFE_U32)
   2137   NODE_NAME_CASE(BFE_I32)
   2138   NODE_NAME_CASE(BFI)
   2139   NODE_NAME_CASE(BFM)
   2140   NODE_NAME_CASE(BREV)
   2141   NODE_NAME_CASE(MUL_U24)
   2142   NODE_NAME_CASE(MUL_I24)
   2143   NODE_NAME_CASE(MAD_U24)
   2144   NODE_NAME_CASE(MAD_I24)
   2145   NODE_NAME_CASE(EXPORT)
   2146   NODE_NAME_CASE(CONST_ADDRESS)
   2147   NODE_NAME_CASE(REGISTER_LOAD)
   2148   NODE_NAME_CASE(REGISTER_STORE)
   2149   NODE_NAME_CASE(LOAD_CONSTANT)
   2150   NODE_NAME_CASE(LOAD_INPUT)
   2151   NODE_NAME_CASE(SAMPLE)
   2152   NODE_NAME_CASE(SAMPLEB)
   2153   NODE_NAME_CASE(SAMPLED)
   2154   NODE_NAME_CASE(SAMPLEL)
   2155   NODE_NAME_CASE(CVT_F32_UBYTE0)
   2156   NODE_NAME_CASE(CVT_F32_UBYTE1)
   2157   NODE_NAME_CASE(CVT_F32_UBYTE2)
   2158   NODE_NAME_CASE(CVT_F32_UBYTE3)
   2159   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
   2160   NODE_NAME_CASE(STORE_MSKOR)
   2161   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   2162   }
   2163 }
   2164 
   2165 static void computeKnownBitsForMinMax(const SDValue Op0,
   2166                                       const SDValue Op1,
   2167                                       APInt &KnownZero,
   2168                                       APInt &KnownOne,
   2169                                       const SelectionDAG &DAG,
   2170                                       unsigned Depth) {
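           // The result of a min or max is always one of the two operands, so
           // any bit that has the same known value in both operands keeps that
           // value in the result; everything else is conservatively unknown.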
   2171   APInt Op0Zero, Op0One;
   2172   APInt Op1Zero, Op1One;
   2173   DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
   2174   DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
   2175 
   2176   KnownZero = Op0Zero & Op1Zero;
   2177   KnownOne = Op0One & Op1One;
   2178 }
   2179 
   2180 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
   2181   const SDValue Op,
   2182   APInt &KnownZero,
   2183   APInt &KnownOne,
   2184   const SelectionDAG &DAG,
   2185   unsigned Depth) const {
   2186 
   2187   KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
   2188 
   2189   APInt KnownZero2;
   2190   APInt KnownOne2;
   2191   unsigned Opc = Op.getOpcode();
   2192 
   2193   switch (Opc) {
   2194   default:
   2195     break;
   2196   case ISD::INTRINSIC_WO_CHAIN: {
   2197     // FIXME: The intrinsic should just use the node.
   2198     switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
   2199     case AMDGPUIntrinsic::AMDGPU_imax:
   2200     case AMDGPUIntrinsic::AMDGPU_umax:
   2201     case AMDGPUIntrinsic::AMDGPU_imin:
   2202     case AMDGPUIntrinsic::AMDGPU_umin:
   2203       computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
   2204                                 KnownZero, KnownOne, DAG, Depth);
   2205       break;
   2206     default:
   2207       break;
   2208     }
   2209 
   2210     break;
   2211   }
   2212   case AMDGPUISD::SMAX:
   2213   case AMDGPUISD::UMAX:
   2214   case AMDGPUISD::SMIN:
   2215   case AMDGPUISD::UMIN:
   2216     computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
   2217                               KnownZero, KnownOne, DAG, Depth);
   2218     break;
   2219 
   2220   case AMDGPUISD::BFE_I32:
   2221   case AMDGPUISD::BFE_U32: {
   2222     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   2223     if (!CWidth)
   2224       return;
   2225 
   2226     unsigned BitWidth = 32;
   2227     uint32_t Width = CWidth->getZExtValue() & 0x1f;
   2228     if (Width == 0) {
   2229       KnownZero = APInt::getAllOnesValue(BitWidth);
   2230       KnownOne = APInt::getNullValue(BitWidth);
   2231       return;
   2232     }
   2233 
    2234     // FIXME: This could do a lot more. If offset is 0, this should be the
    2235     // same as the sign_extend_inreg implementation, but that involves
    2236     // duplicating it.
    2237     // The high bits of BFE_I32 replicate the extracted field's sign bit,
    2238     // which is unknown here, so only BFE_U32 yields known-zero high bits.
    2239     if (Opc == AMDGPUISD::BFE_U32)
    2240       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
   2240 
   2241     break;
   2242   }
   2243   }
   2244 }
   2245 
   2246 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   2247   SDValue Op,
   2248   const SelectionDAG &DAG,
   2249   unsigned Depth) const {
   2250   switch (Op.getOpcode()) {
   2251   case AMDGPUISD::BFE_I32: {
   2252     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   2253     if (!Width)
   2254       return 1;
   2255 
   2256     unsigned SignBits = 32 - Width->getZExtValue() + 1;
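             // A width-bit signed field extended to 32 bits has
             // 32 - width + 1 sign bits, e.g. 32 - 8 + 1 = 25 for an 8-bit
             // field.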
   2257     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   2258     if (!Offset || !Offset->isNullValue())
   2259       return SignBits;
   2260 
   2261     // TODO: Could probably figure something out with non-0 offsets.
   2262     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
   2263     return std::max(SignBits, Op0SignBits);
   2264   }
   2265 
   2266   case AMDGPUISD::BFE_U32: {
   2267     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   2268     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
   2269   }
   2270 
   2271   default:
   2272     return 1;
   2273   }
   2274 }
   2275