//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"

using namespace llvm;

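// Allocates the next incoming kernel argument in the kernarg segment: the
// running offset is bumped to the argument's original alignment and recorded
// as a custom memory location. For example, an i32 following a 16-byte
// aligned v4i32 would land at byte offset 16.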
static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                            CCValAssign::LocInfo LocInfo,
                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
  MachineFunction &MF = State.getMachineFunction();
  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
                                         ArgFlags.getOrigAlign());
  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
  return true;
}

#include "AMDGPUGenCallingConv.inc"

// Find a larger type to do a load / store of a vector with.
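// For example, a 96-bit v3f32 is handled as v3i32, while types of 32 bits or
// fewer map to the integer type with the same store size.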
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
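  // For example, a sextload from i8 to i64 becomes a sextload from i8 to i32
  // followed by a sign extension of the 32-bit result to 64 bits.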
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
  }

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
  }

  for (MVT VT : MVT::integer_vector_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
  }

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);

  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);

  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setOperationAction(ISD::Constant, MVT::i32, Legal);
  setOperationAction(ISD::Constant, MVT::i64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  // This is totally unsupported, just custom lower to produce an error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
  setOperationAction(ISD::FABS,   MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);

  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Custom);
  setOperationAction(ISD::FREM, MVT::f64, Custom);

  // v_mad_f32 does not support denormals according to some sources.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // The GPU does not have a divrem instruction for signed or unsigned
    // division.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // The GPU does not have [S|U]MUL_LOHI as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // The hardware supports 32-bit ROTR, but not ROTL.
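  // (rotl(x, n) is equivalent to rotr(x, bitwidth - n), so expanded ROTL can
  // still make use of the hardware's 32-bit ROTR.)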
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction(ISD::SMIN, MVT::i32, Legal);
  setOperationAction(ISD::UMIN, MVT::i32, Legal);
  setOperationAction(ISD::SMAX, MVT::i32, Legal);
  setOperationAction(ISD::UMAX, MVT::i32, Legal);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);

  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
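  // (An unsigned BFE with offset o and width w computes
  // (Src >> o) & ((1 << w) - 1); the signed variant sign-extends the field.)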
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction(ISD::ADD,  VT, Expand);
    setOperationAction(ISD::AND,  VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL,  VT, Expand);
    setOperationAction(ISD::OR,   VT, Expand);
    setOperationAction(ISD::SHL,  VT, Expand);
    setOperationAction(ISD::SRA,  VT, Expand);
    setOperationAction(ISD::SRL,  VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::SUB,  VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::XOR,  VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v4f32
  };

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FMINNUM, VT, Expand);
    setOperationAction(ISD::FMAXNUM, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
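  // Booleans are all-ones (-1) when true, matching the per-lane -1/0 masks
  // that the hardware's compare instructions produce.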

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // SI at least has hardware support for floating point exceptions, but no way
  // of using or handling them is implemented. They are also optional in OpenCL
  // (Section 7.3)
  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());

  setSelectIsExpensive(false);
  PredictableSelectIsExpensive = false;

  setFsqrtIsCheap(true);

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // FIXME: Need to really handle these.
  MaxStoresPerMemcpy  = 4096;
  MaxStoresPerMemmove = 4096;
  MaxStoresPerMemset  = 4096;

  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType,
                                                 EVT NewVT) const {
  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load, this is always better.
  if (NewSize == 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {
  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

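  // For example, casting a v4i16 load to v2i32 is beneficial (16-bit scalars
  // widen to 32-bit), but casting a v2i16 load to v4i8 is not.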
  return (LScalarSize < CastScalarSize) ||
         (CastScalarSize >= 32);
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the 64-bit expansion, since these operations are cheap
// enough to be worth speculating.
// FIXME: These should really have the size as a parameter.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32 || VT == MVT::f64;
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32 || VT == MVT::f64;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.
  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
         (Dest->getPrimitiveSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.
  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}

void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
  State.AnalyzeReturn(Outs, RetCC_SI);
}

SDValue
AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                  bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  const SmallVectorImpl<SDValue> &OutVals,
                                  const SDLoc &DL, SelectionDAG &DAG) const {
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = *DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
    InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = *DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->dump(&DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

// FIXME: This implements accesses to initialized globals in the constant
// address space by copying them to private and accessing that. It does not
// properly handle illegal types or vectors. The private vector loads are not
// scalarized, and the illegal scalars hit an assertion. This technique will not
// work well with large initializers, and this should eventually be
// removed. Initialized globals should be placed into a data section that the
// runtime will load into a buffer before the kernel is executed. Uses of the
// global need to be replaced with a pointer loaded from an implicit kernel
// argument into this buffer holding the copy of the data, which will remove the
// need for any of this.
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                       const GlobalValue *GV,
                                                       const SDValue &InitPtr,
                                                       SDValue Chain,
                                                       SelectionDAG &DAG) const {
  const DataLayout &TD = DAG.getDataLayout();
  SDLoc DL(InitPtr);
  Type *InitTy = Init->getType();

  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
                        false, TD.getPrefTypeAlignment(InitTy));
  }

  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
    EVT VT = EVT::getEVT(CFP->getType());
    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
                        false, TD.getPrefTypeAlignment(CFP->getType()));
  }

  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
    const StructLayout *SL = TD.getStructLayout(ST);

    EVT PtrVT = InitPtr.getValueType();
    SmallVector<SDValue, 8> Chains;

    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(I);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
    EVT PtrVT = InitPtr.getValueType();

    unsigned NumElements;
    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
      NumElements = AT->getNumElements();
    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
      NumElements = VT->getNumElements();
    else
      llvm_unreachable("Unexpected type");

    unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
    SmallVector<SDValue, 8> Chains;
    for (unsigned i = 0; i < NumElements; ++i) {
      SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(i);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (isa<UndefValue>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
                        false, TD.getPrefTypeAlignment(InitTy));
  }

  Init->dump();
  llvm_unreachable("Unhandled constant initializer");
}

static bool hasDefinedInitializer(const GlobalValue *GV) {
  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer())
    return false;

  return !isa<UndefValue>(GVar->getInitializer());
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {
  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  switch (G->getAddressSpace()) {
  case AMDGPUAS::CONSTANT_ADDRESS: {
    MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
    SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT);
    return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA);
  }
  case AMDGPUAS::LOCAL_ADDRESS: {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    if (hasDefinedInitializer(GV))
      break;

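    // Allocate (or look up) this global's offset within the kernel's LDS
    // allocation; repeated uses of one global must map to a single address.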
    unsigned Offset;
    if (MFI->LocalMemoryObjects.count(GV) == 0) {
      unsigned Align = GV->getAlignment();
      if (Align == 0)
        Align = DL.getABITypeAlignment(GV->getValueType());

      // TODO: We should sort these to minimize wasted space due to alignment
      // padding. Currently the padding is decided by the first encountered use
      // during lowering.
      Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
      MFI->LocalMemoryObjects[GV] = Offset;
      MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
    } else {
      Offset = MFI->LocalMemoryObjects[GV];
    }

    return DAG.getConstant(Offset, SDLoc(Op),
                           getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
  }
  }

  const Function &Fn = *DAG.getMachineFunction().getFunction();
  DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(BadInit);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    SelectionDAG &DAG) const {
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  switch (IntrinsicID) {
    default: return Op;
    case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_ldexp: // Legacy name
      return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
                                                   Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
      return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
  }
}

/// \brief Generate Min/Max node
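/// Folds a select_cc whose true/false values match its compared operands into
/// FMIN_LEGACY / FMAX_LEGACY. The legacy min/max do not treat NaN operands
/// commutatively, so the operand order is chosen per condition code to
/// preserve the select's NaN behavior.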
SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return SDValue();

  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
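    // Ordered or don't-care on NaN; assume ordered, mirroring the LT cases
    // above.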
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

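// Splits a 64-bit value into 32-bit halves by bitcasting it to v2i32; element
// 0 holds the low half and element 1 the high half (little-endian layout).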
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

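// Splits a wide vector load (e.g. v8i32) into two half-width loads whose
// chains are joined with a TokenFactor; two-element vectors are scalarized
// instead to avoid creating one-element vectors.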
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorLoad(Load, DAG);

  SDValue BasePtr = Load->getBasePtr();
  EVT PtrVT = BasePtr.getValueType();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);

  unsigned Size = LoMemVT.getStoreSize();
  unsigned BaseAlign = Load->getAlignment();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoLoad
    = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                     Load->getChain(), BasePtr,
                     SrcValue,
                     LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
                     Load->isInvariant(), BaseAlign);

  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                              DAG.getConstant(Size, SL, PtrVT));

  SDValue HiLoad
    = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
                     Load->getChain(), HiPtr,
                     SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
                     Load->isInvariant(), HiAlign);

  SDValue Ops[] = {
    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                LoLoad.getValue(1), HiLoad.getValue(1))
  };

  return DAG.getMergeValues(Ops, SL);
}

// FIXME: This isn't doing anything for SI. This should be used in a target
// combine during type legalization.
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a 32-bit
  // vector truncating store into a single i32 store.
  // XXX: We could also optimize other vector bitwidths.
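  // For example, a v4i8 truncating store becomes a single i32 store of
  // (e0 | (e1 << 8) | (e2 << 16) | (e3 << 24)).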
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  SDValue Value = Store->getValue();
  EVT VT = Value.getValueType();
  EVT ElemVT = VT.getVectorElementType();
  SDValue Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  unsigned PackedSize = MemVT.getStoreSizeInBits();
  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32);

  assert(Value.getValueType().getScalarSizeInBits() >= 32);

  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, DL, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg

    SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32);
    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);

    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
    }
  }

  if (PackedSize < 32) {
    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
                             Store->getMemOperand()->getPointerInfo(),
                             PackedVT,
                             Store->isNonTemporal(), Store->isVolatile(),
                             Store->getAlignment());
  }

  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getMemOperand()->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}

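// Splits a wide vector store into two half-width truncating stores, with the
// high half's pointer offset by the low half's store size.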
   1174 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
   1175                                                SelectionDAG &DAG) const {
   1176   StoreSDNode *Store = cast<StoreSDNode>(Op);
   1177   SDValue Val = Store->getValue();
   1178   EVT VT = Val.getValueType();
   1179 
   1180   // If this is a 2 element vector, we really want to scalarize and not create
   1181   // weird 1 element vectors.
   1182   if (VT.getVectorNumElements() == 2)
   1183     return scalarizeVectorStore(Store, DAG);
   1184 
   1185   EVT MemVT = Store->getMemoryVT();
   1186   SDValue Chain = Store->getChain();
   1187   SDValue BasePtr = Store->getBasePtr();
   1188   SDLoc SL(Op);
   1189 
   1190   EVT LoVT, HiVT;
   1191   EVT LoMemVT, HiMemVT;
   1192   SDValue Lo, Hi;
   1193 
   1194   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
   1195   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
   1196   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
   1197 
   1198   EVT PtrVT = BasePtr.getValueType();
   1199   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
   1200                               DAG.getConstant(LoMemVT.getStoreSize(), SL,
   1201                                               PtrVT));
   1202 
   1203   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
   1204   unsigned BaseAlign = Store->getAlignment();
   1205   unsigned Size = LoMemVT.getStoreSize();
   1206   unsigned HiAlign = MinAlign(BaseAlign, Size);
   1207 
   1208   SDValue LoStore
   1209     = DAG.getTruncStore(Chain, SL, Lo,
   1210                         BasePtr,
   1211                         SrcValue,
   1212                         LoMemVT,
   1213                         Store->isNonTemporal(),
   1214                         Store->isVolatile(),
   1215                         BaseAlign);
   1216   SDValue HiStore
   1217     = DAG.getTruncStore(Chain, SL, Hi,
   1218                         HiPtr,
   1219                         SrcValue.getWithOffset(Size),
   1220                         HiMemVT,
   1221                         Store->isNonTemporal(),
   1222                         Store->isVolatile(),
   1223                         HiAlign);
   1224 
   1225   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
   1226 }
   1227 
   1228 // This is a shortcut for integer division because we have fast i32<->f32
   1229 // conversions, and fast f32 reciprocal instructions. The fractional part of a
   1230 // float is enough to accurately represent up to a 24-bit signed integer.
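// A minimal scalar sketch of the trick, illustrative only (the real lowering
// below builds DAG nodes and uses the hardware RCP instead of a divide):
//   float fa = (float)a, fb = (float)b;
//   float fq = truncf(fa * (1.0f / fb));   // candidate quotient
//   float fr = fmaf(-fq, fb, fa);          // remainder estimate
//   int q = (int)fq;
//   if (fabsf(fr) >= fabsf(fb))            // reciprocal was off by one
//     q += jq;                             // jq is +/-1 depending on signs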
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  // float fr = mad(fqneg, fb, fa);
  SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation; it's easier to recompute it.
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to the number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                      SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &Results) const {
  assert(Op.getValueType() == MVT::i64);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue one = DAG.getConstant(1, DL, HalfVT);
  SDValue zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);

  if (VT == MVT::i64 &&
      DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  // Get speculative values for the quotient and remainder, assuming the high
  // half of the divisor is zero.
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
  SDValue DIV_Lo = zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

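  // Restoring binary long division: each iteration shifts the next bit of
  // LHS_Lo into REM, sets the corresponding quotient bit in DIV_Lo when
  // REM >= RHS, and subtracts RHS from REM in that case.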
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);

  // RCP_LO = mul(RCP, Den)
  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);

  // RCP_HI = mulhu(RCP, Den)
  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                                     RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
                                           NEG_RCP_LO, RCP_LO,
                                           ISD::SETEQ);
  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);

  // RCP_S_E = RCP - E
  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
                                     RCP_A_E, RCP_S_E,
                                     ISD::SETEQ);
  // Quotient = mulhu(Tmp0, Num)
  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
                                                 DAG.getConstant(-1, DL, VT),
                                                 DAG.getConstant(0, DL, VT),
                                                 ISD::SETUGE);
  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
                                                  Num_S_Remainder,
                                                  DAG.getConstant(-1, DL, VT),
                                                  DAG.getConstant(0, DL, VT),
                                                  ISD::SETUGE);
  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
                                               Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
                                       DAG.getConstant(1, DL, VT));

  // Quotient_S_One = Quotient - 1
  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
                                       DAG.getConstant(1, DL, VT));

  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
                                     Quotient, Quotient_A_One, ISD::SETEQ);

  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
                            Quotient_S_One, Div, ISD::SETEQ);

  // Calculate Rem result:

  // Remainder_S_Den = Remainder - Den
  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);

  // Remainder_A_Den = Remainder + Den
  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);

  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
                                    Remainder, Remainder_S_Den, ISD::SETEQ);

  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
                            Remainder_A_Den, Rem, ISD::SETEQ);
  SDValue Ops[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops, DL);
}

SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue NegOne = DAG.getConstant(-1, DL, VT);

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
      return Res;
  }

  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(LHS) > 32 &&
      DAG.ComputeNumSignBits(RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    // HiLo split
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                                 LHS_Lo, RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    };
    return DAG.getMergeValues(Res, DL);
  }

  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);

  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);

  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
  SDValue Rem = Div.getValue(1);

  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);

  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}

// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
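// E.g. frem(5.5, 2.0): fdiv gives 2.75, ftrunc gives 2.0, and
// 5.5 - 2.0 * 2.0 = 1.5, which keeps the sign of the dividend as frem
// requires.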
SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // TODO: Should this propagate fast-math-flags?

  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Trunc, Y);

  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
}

SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0
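  // E.g. ceil(2.3): trunc gives 2.0, and since 2.3 > 0.0 and 2.3 != 2.0 we
  // add 1.0 for a result of 3.0. For ceil(-2.3), trunc's -2.0 is already
  // correct.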

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

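// Extract the unbiased exponent from the high 32 bits of an f64. BFE pulls
// the 11-bit biased exponent field (bits 20..30 of Hi), and subtracting the
// IEEE-754 bias of 1023 gives the real exponent; e.g. for 1.0 the field is
// 1023 and the result is 0.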
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
                                  SelectionDAG &DAG) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                Hi,
                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
                                DAG.getConstant(ExpBits, SL, MVT::i32));
  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
                            DAG.getConstant(1023, SL, MVT::i32));

  return Exp;
}

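// f64 ftrunc via bit manipulation: mask off the fraction bits that lie below
// the binary point for this exponent. Exponents < 0 truncate to +/-0.0 (only
// the sign bit survives), and exponents > 51 have no fraction bits to clear,
// so the input passes through unchanged.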
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64 bits.
  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);

  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
  const SDValue FractMask
    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);

  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);

  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);

  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}

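// f64 frint via the usual magic-number trick: adding and then subtracting
// 0x1.0p+52 (copysigned to the input) rounds to integer in the current
// rounding mode, and inputs with |x| > 0x1.fffffffffffffp+51 are already
// integral, so the source is returned unchanged for them.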
SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);

  // TODO: Should this propagate fast-math-flags?

  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);

  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}

SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
  // FNEARBYINT and FRINT are the same, except in their handling of FP
  // exceptions. Those aren't really meaningful for us, and OpenCL only has
  // rint, so just treat them as equivalent.
  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
}

// XXX - May require not supporting f32 denormals?
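// round(x) as trunc(x) plus a copysigned 1.0 when the truncated-away part is
// at least half: e.g. for 2.5 the diff 0.5 triggers the add, giving 3.0,
// while for -2.3 the diff 0.3 does not, leaving -2.0.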
SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);

  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);

  // TODO: Should this propagate fast-math-flags?

  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);

  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
  const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);

  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);

  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);

  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
}

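// f64 round by exponent surgery on the raw bits: M masks the fraction bits
// below the binary point for this exponent and D is 0.5 at the same scale,
// so adding D when any fraction bits are set and then clearing M rounds half
// away from zero. Exponents < 0 and > 51 are special-cased at the end.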
SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);

  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);

  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
                                       MVT::i64);

  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
                                          MVT::i64),
                          Exp);

  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
                              DAG.getConstant(0, SL, MVT::i64), Tmp0,
                              ISD::SETNE);

  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
                             D, DAG.getConstant(0, SL, MVT::i64));
  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);

  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);

  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
                            ExpEqNegOne,
                            DAG.getConstantFP(1.0, SL, MVT::f64),
                            DAG.getConstantFP(0.0, SL, MVT::f64));

  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);

  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);

  return K;
}

SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}

SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);
  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;

  if (ZeroUndef && Src.getValueType() == MVT::i32)
    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), MVT::i32);

  SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);

  SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
  SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);

  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
  SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);

  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
  SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);

  if (!ZeroUndef) {
    // Test if the full 64-bit input is zero.

    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
    // which we probably don't want.
    SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);

    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
    // with the same cycles, otherwise it is slower.
    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
    // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);

    const SDValue Bits64 = DAG.getConstant(64, SL, MVT::i32);

    // The instruction returns -1 for 0 input, but the defined intrinsic
    // behavior is to return the number of bits.
    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
                          SrcIsZero, Bits64, NewCtlz);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // Unsigned
  // cul2f(ulong u)
  // {
  //  uint lz = clz(u);
  //  uint e = (u != 0) ? 127U + 63U - lz : 0;
  //  u = (u << lz) & 0x7fffffffffffffffUL;
  //  ulong t = u & 0xffffffffffUL;
  //  uint v = (e << 23) | (uint)(u >> 40);
  //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //  return as_float(v + r);
  // }
  // Signed
  // cl2f(long l)
  // {
  //  long s = l >> 63;
  //  float r = cul2f((l + s) ^ s);
  //  return s ? -r : r;
  // }
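  // Sanity check with u == 1: lz == 63, e == 127, the shifted u masks to 0,
  // so t == 0, v == 127 << 23 (the bit pattern of 1.0f), r == 0, and the
  // result is exactly 1.0f.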

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);
  SDValue L = Src;

  SDValue S;
  if (Signed) {
    const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
    S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);

    SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
    L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
  }

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), MVT::f32);

  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);

  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
  SDValue E = DAG.getSelect(SL, MVT::i32,
    DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
    DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
    ZeroI32);

  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
    DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
    DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));

  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
                          DAG.getConstant(0xffffffffffULL, SL, MVT::i64));

  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
                             U, DAG.getConstant(40, SL, MVT::i64));

  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
    DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
    DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));

  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);

  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);

  SDValue R = DAG.getSelect(SL, MVT::i32,
    RCmp,
    One,
    DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);

  if (!Signed)
    return R;

  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
}

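// Convert i64/u64 to f64 in two 32-bit halves: result = cvt(hi) * 2^32 +
// cvt(lo), where only the high half carries the sign. Each conversion is
// exact, so the single rounding in the final fadd gives a correctly rounded
// result.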
SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
                           DAG.getConstant(0, SL, MVT::i32));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
                           DAG.getConstant(1, SL, MVT::i32));

  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
                              SL, MVT::f64, Hi);

  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);

  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
                              DAG.getConstant(32, SL, MVT::i32));
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}

SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
         "operation should be legal");

  EVT DestVT = Op.getValueType();
  if (DestVT == MVT::f64)
    return LowerINT_TO_FP64(Op, DAG, false);

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, false);

  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
         "operation should be legal");

  EVT DestVT = Op.getValueType();
  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, true);

  if (DestVT == MVT::f64)
    return LowerINT_TO_FP64(Op, DAG, true);

  return SDValue();
}

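// Convert f64 to a 64-bit integer by splitting into two 32-bit pieces. K0 is
// the bit pattern of 2^-32 and K1 of -2^32: floor(trunc(x) * 2^-32) is the
// high word, and fma(hi, -2^32, trunc(x)) leaves the low word for a second
// 32-bit conversion.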
SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(0);

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
                                 MVT::f64);
  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
                                 MVT::f64);
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);

  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);

  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);

  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
                           MVT::i32, FloorMul);
  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});

  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}

SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);

  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    return LowerFP64_TO_INT(Op, DAG, true);

  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);

  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
    return LowerFP64_TO_INT(Op, DAG, false);

  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  if (!VT.isVector())
    return SDValue();

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> Args;
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getBuildVector(VT, DL, Args);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

static bool isU24(SDValue Op, SelectionDAG &DAG) {
  APInt KnownZero, KnownOne;
  EVT VT = Op.getValueType();
  DAG.computeKnownBits(Op, KnownZero, KnownOne);

  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
}

static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                     // as unsigned 24-bit values.
         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
}

static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = Op.getValueType();

  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
  APInt KnownZero, KnownOne;
  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
    DCI.CommitTargetLoweringOpt(TLO);
}

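// Constant fold a bitfield extract of Src0: shift the field up against the
// sign bit, then shift back down; IntTy selects arithmetic (signed BFE) or
// logical (unsigned BFE) behavior. E.g. constantFoldBFE<int32_t> with
// Src0 = 0x80, Offset = 4, Width = 4 extracts 0b1000 and sign-extends to -8.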
template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
                               uint32_t Width, const SDLoc &DL) {
  if (Width + Offset < 32) {
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    return DAG.getConstant(Result, DL, MVT::i32);
  }

  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}

static bool hasVolatileUser(SDNode *Val) {
  for (SDNode *U : Val->uses()) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
      if (M->isVolatile())
        return true;
    }
  }

  return false;
}

bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
  // i32 vectors are the canonical memory type.
  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    return false;

  if (!VT.isByteSized())
    return false;

  unsigned Size = VT.getStoreSize();

  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    return false;

  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    return false;

  return true;
}

// Replace a load of an illegal type with a bitcast from a load of a
// friendlier type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  LoadSDNode *LN = cast<LoadSDNode>(N);
  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  unsigned Align = LN->getAlignment();
  if (Align < Size && isTypeLegal(VT)) {
    bool IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack
    // and unpack the bytes again are not eliminated in the case of an
    // unaligned copy.
    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
      return DAG.getMergeValues(Ops, SDLoc(N));
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(NewVT, SL, LN->getChain(),
                  LN->getBasePtr(), LN->getMemOperand());

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
  DCI.CombineTo(N, BC, NewLoad.getValue(1));
  return SDValue(N, 0);
}

// Replace a store of an illegal type with a store of a bitcast to a
// friendlier type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  StoreSDNode *SN = cast<StoreSDNode>(N);
  if (SN->isVolatile() || !ISD::isNormalStore(SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  unsigned Align = SN->getAlignment();
  if (Align < Size && isTypeLegal(VT)) {
    bool IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
      return expandUnalignedStore(SN, DAG);

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
  }

  return DAG.getStore(SN->getChain(), SL, CastVal,
                      SN->getBasePtr(), SN->getMemOperand());
}

// TODO: Should repeat for other bit ops.
SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i64)
    return SDValue();

  // Break up a 64-bit AND of a constant into two 32-bit ANDs. This will
  // typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
  // integer combine opportunities since most 64-bit operations are decomposed
  // this way.
  // TODO: We won't want this for SALU especially if it is an inline immediate.
  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  uint64_t Val = RHS->getZExtValue();
  if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
    // If either half of the constant is 0, this is really a 32-bit and, so
    // split it. If we can re-use the full materialized constant, keep it.
    return SDValue();
  }

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);

  SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
  SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);

  SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
  SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(Lo.getNode());
  DCI.AddToWorklist(Hi.getNode());

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i64)
    return SDValue();

  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
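  // E.g. (shl i64:x, 40) becomes build_pair 0, (shl lo_32(x), 8): the low
  // result word is zero and only a 32-bit shift of the low half remains.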
   2342   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   2343   if (!RHS)
   2344     return SDValue();
   2345 
   2346   unsigned RHSVal = RHS->getZExtValue();
   2347   if (RHSVal < 32)
   2348     return SDValue();
   2349 
   2350   SDValue LHS = N->getOperand(0);
   2351 
   2352   SDLoc SL(N);
   2353   SelectionDAG &DAG = DCI.DAG;
   2354 
   2355   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
   2356 
   2357   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
   2358   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
   2359 
   2360   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   2361 
   2362   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
   2363   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
   2364 }
   2365 
   2366 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
   2367                                                 DAGCombinerInfo &DCI) const {
   2368   if (N->getValueType(0) != MVT::i64)
   2369     return SDValue();
   2370 
   2371   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   2372   if (!RHS)
   2373     return SDValue();
   2374 
   2375   SelectionDAG &DAG = DCI.DAG;
   2376   SDLoc SL(N);
   2377   unsigned RHSVal = RHS->getZExtValue();
   2378 
   2379   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
   2380   if (RHSVal == 32) {
   2381     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
   2382     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
   2383                                    DAG.getConstant(31, SL, MVT::i32));
   2384 
   2385     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
   2386     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
   2387   }
   2388 
   2389   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
   2390   if (RHSVal == 63) {
   2391     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
   2392     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
   2393                                    DAG.getConstant(31, SL, MVT::i32));
   2394     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
   2395     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
   2396   }
   2397 
   2398   return SDValue();
   2399 }
   2400 
   2401 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   2402                                                 DAGCombinerInfo &DCI) const {
   2403   if (N->getValueType(0) != MVT::i64)
   2404     return SDValue();
   2405 
   2406   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   2407   if (!RHS)
   2408     return SDValue();
   2409 
   2410   unsigned ShiftAmt = RHS->getZExtValue();
   2411   if (ShiftAmt < 32)
   2412     return SDValue();
   2413 
   2414   // srl i64:x, C for C >= 32
   2415   // =>
   2416   //   build_pair (srl hi_32(x), C - 32), 0
   2417 
   2418   SelectionDAG &DAG = DCI.DAG;
   2419   SDLoc SL(N);
   2420 
   2421   SDValue One = DAG.getConstant(1, SL, MVT::i32);
   2422   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   2423 
   2424   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
   2425   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
   2426                            VecOp, One);
   2427 
   2428   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
   2429   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
   2430 
   2431   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
   2432 
   2433   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
   2434 }
   2435 
   2436 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   2437                                                 DAGCombinerInfo &DCI) const {
   2438   EVT VT = N->getValueType(0);
   2439 
   2440   if (VT.isVector() || VT.getSizeInBits() > 32)
   2441     return SDValue();
   2442 
   2443   SelectionDAG &DAG = DCI.DAG;
   2444   SDLoc DL(N);
   2445 
   2446   SDValue N0 = N->getOperand(0);
   2447   SDValue N1 = N->getOperand(1);
   2448   SDValue Mul;
   2449 
   2450   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
   2451     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
   2452     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
   2453     Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
   2454   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
   2455     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
   2456     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
   2457     Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
   2458   } else {
   2459     return SDValue();
   2460   }
   2461 
   2462   // We need to use sext even for MUL_U24, because MUL_U24 is used
   2463   // for signed multiply of 8 and 16-bit types.
   2464   return DAG.getSExtOrTrunc(Mul, DL, VT);
   2465 }
   2466 
   2467 static bool isNegativeOne(SDValue Val) {
   2468   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
   2469     return C->isAllOnesValue();
   2470   return false;
   2471 }
   2472 
   2473 static bool isCtlzOpc(unsigned Opc) {
   2474   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
   2475 }
   2476 
   2477 // Get FFBH node if the incoming op may have been type legalized from a smaller
   2478 // type VT.
   2479 // Need to match pre-legalized type because the generic legalization inserts the
   2480 // add/sub between the select and compare.
static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG,
                           const SDLoc &SL, SDValue Op) {
  EVT VT = Op.getValueType();
  EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  if (LegalVT != MVT::i32)
    return SDValue();

  if (VT != MVT::i32)
    Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op);

  SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op);
  if (VT != MVT::i32)
    FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH);

  return FFBH;
}

// The native instructions return -1 on 0 input. Optimize out a select that
// produces -1 on 0.
//
// TODO: If zero is not undef, we could also do this if the output is compared
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
                                                 SDValue LHS, SDValue RHS,
                                                 DAGCombinerInfo &DCI) const {
  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
  if (!CmpRhs || !CmpRhs->isNullValue())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue CmpLHS = Cond.getOperand(0);

  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
  if (CCOpcode == ISD::SETEQ &&
      isCtlzOpc(RHS.getOpcode()) &&
      RHS.getOperand(0) == CmpLHS &&
      isNegativeOne(LHS)) {
    return getFFBH_U32(*this, DAG, SL, CmpLHS);
  }

  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
  if (CCOpcode == ISD::SETNE &&
      isCtlzOpc(LHS.getOpcode()) &&
      LHS.getOperand(0) == CmpLHS &&
      isNegativeOne(RHS)) {
    return getFFBH_U32(*this, DAG, SL, CmpLHS);
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  SDValue Cond = N->getOperand(0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = Cond.getOperand(0);
  SDValue RHS = Cond.getOperand(1);
  SDValue CC = Cond.getOperand(2);

  SDValue True = N->getOperand(1);
  SDValue False = N->getOperand(2);

  if (VT == MVT::f32 && Cond.hasOneUse()) {
    SDValue MinMax
      = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
    // Revisit this node so we can catch min3/max3/med3 patterns. This is
    // currently disabled: MinMax may be a null SDValue, so it cannot be
    // re-added to the worklist unconditionally.
    //DCI.AddToWorklist(MinMax.getNode());
    return MinMax;
  }

  // There's no reason not to do this if the condition has other uses.
  return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
}

SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(0);
    if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
      assert(Src.getValueType() == MVT::i64);
      SDLoc SL(N);
      uint64_t CVal = C->getZExtValue();
      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
    }

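    // Likewise for FP constants, going through the constant's bit pattern,
    // e.g. (illustrative):
    //   (v2f32 (bitcast f64:k)) -> (bitcast (build_vector lo_32(k), hi_32(k)))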
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));

      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    }

    break;
  }
  case ISD::SHL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performShlCombine(N, DCI);
  }
  case ISD::SRL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSrlCombine(N, DCI);
  }
  case ISD::SRA: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSraCombine(N, DCI);
  }
  case ISD::AND: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performAndCombine(N, DCI);
  }
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_I24:
  case AMDGPUISD::MUL_U24: {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    simplifyI24(N0, DCI);
    simplifyI24(N1, DCI);
    return SDValue();
  }
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
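    // BFE_[IU]32 x, offset, width extracts the width-bit field of x starting
    // at bit offset, then sign (BFE_I32) or zero (BFE_U32) extends it to 32
    // bits.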
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, DL, MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
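      // A value sign-extended from WidthVal bits has 32 - WidthVal + 1 copies
      // of its sign bit; a zero-extended one has 32 - WidthVal known-zero high
      // bits, which ComputeNumSignBits also counts as sign bits.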

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends up as several nodes,
        // although we could handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        CVal->getSExtValue(),
                                        OffsetVal,
                                        WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       CVal->getZExtValue(),
                                       OffsetVal,
                                       WidthVal,
                                       DL);
    }

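    // If the field reaches bit 31, the BFE is just an arithmetic or logical
    // shift right, e.g. (bfe_i32 x, 24, 8) -> (sra x, 24).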
    if ((OffsetVal + WidthVal) >= 32) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

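    // Only bits [OffsetVal, OffsetVal + WidthVal) of the source are consumed,
    // so try to simplify the source based on just those demanded bits.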
    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(32,
                                         OffsetVal,
                                         OffsetVal + WidthVal);

      APInt KnownZero, KnownOne;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded,
                                   KnownZero, KnownOne, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  }
  return SDValue();
}

//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//

void AMDGPUTargetLowering::getOriginalFunctionArgs(
                               SelectionDAG &DAG,
                               const Function *F,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    if (Ins[i].ArgVT == Ins[i].VT) {
      OrigIns.push_back(Ins[i]);
      continue;
    }

    EVT VT;
    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
      // Vector has been split into scalars.
      VT = Ins[i].ArgVT.getVectorElementType();
    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
               Ins[i].ArgVT.getVectorElementType() !=
               Ins[i].VT.getVectorElementType()) {
      // Vector elements have been promoted.
      VT = Ins[i].ArgVT;
    } else {
      // Vector has been split into smaller vectors.
      VT = Ins[i].VT;
    }

    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
    OrigIns.push_back(Arg);
  }
}

SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                  const TargetRegisterClass *RC,
                                                   unsigned Reg, EVT VT) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned VirtualRegister;
  if (!MRI.isLiveIn(Reg)) {
    VirtualRegister = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, VirtualRegister);
  } else {
    VirtualRegister = MRI.getLiveInVirtReg(Reg);
  }
  return DAG.getRegister(VirtualRegister, VT);
}

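// Implicit parameters are placed at the end of the explicit kernel arguments;
// MFI->ABIArgOffset is the start of that area, with GRID_DIM at offset 0 and
// GRID_OFFSET 4 bytes in, as encoded below.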
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
  uint64_t ArgOffset = MFI->ABIArgOffset;
  switch (Param) {
  case GRID_DIM:
    return ArgOffset;
  case GRID_OFFSET:
    return ArgOffset + 4;
  }
  llvm_unreachable("unexpected implicit parameter type");
}

#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AMDGPUISD::NodeType)Opcode) {
  case AMDGPUISD::FIRST_NUMBER: break;
  // AMDIL DAG nodes
  NODE_NAME_CASE(CALL);
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(BRANCH_COND);

  // AMDGPU DAG nodes
  NODE_NAME_CASE(ENDPGM)
  NODE_NAME_CASE(RETURN)
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(COS_HW)
  NODE_NAME_CASE(SIN_HW)
  NODE_NAME_CASE(FMAX_LEGACY)
  NODE_NAME_CASE(FMIN_LEGACY)
  NODE_NAME_CASE(FMAX3)
  NODE_NAME_CASE(SMAX3)
  NODE_NAME_CASE(UMAX3)
  NODE_NAME_CASE(FMIN3)
  NODE_NAME_CASE(SMIN3)
  NODE_NAME_CASE(UMIN3)
  NODE_NAME_CASE(FMED3)
  NODE_NAME_CASE(SMED3)
  NODE_NAME_CASE(UMED3)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
  NODE_NAME_CASE(DIV_FIXUP)
  NODE_NAME_CASE(TRIG_PREOP)
  NODE_NAME_CASE(RCP)
  NODE_NAME_CASE(RSQ)
  NODE_NAME_CASE(RSQ_LEGACY)
  NODE_NAME_CASE(RSQ_CLAMP)
  NODE_NAME_CASE(LDEXP)
  NODE_NAME_CASE(FP_CLASS)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(CARRY)
  NODE_NAME_CASE(BORROW)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(BFI)
  NODE_NAME_CASE(BFM)
  NODE_NAME_CASE(FFBH_U32)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(TEXTURE_FETCH)
  NODE_NAME_CASE(EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(LOAD_INPUT)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(CVT_F32_UBYTE0)
  NODE_NAME_CASE(CVT_F32_UBYTE1)
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(CONST_DATA_PTR)
  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
  NODE_NAME_CASE(SENDMSG)
  NODE_NAME_CASE(INTERP_MOV)
  NODE_NAME_CASE(INTERP_P1)
  NODE_NAME_CASE(INTERP_P2)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
  NODE_NAME_CASE(ATOMIC_INC)
  NODE_NAME_CASE(ATOMIC_DEC)
  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
  }
  return nullptr;
}

SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
                                               DAGCombinerInfo &DCI,
                                               unsigned &RefinementSteps,
                                               bool &UseOneConstNR) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
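    // Requesting 0 refinement steps assumes the hardware f32 rsq is accurate
    // enough to be used without Newton-Raphson iterations.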
    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also an f64 rsq instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}

SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
                                               DAGCombinerInfo &DCI,
                                               unsigned &RefinementSteps) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    // Reciprocal, < 1 ulp error.
    //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).

    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also an f64 rcp instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}

void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
  const SDValue Op,
  APInt &KnownZero,
  APInt &KnownOne,
  const SelectionDAG &DAG,
  unsigned Depth) const {

  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.

  APInt KnownZero2;
  APInt KnownOne2;
  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
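    // CARRY and BORROW produce only 0 or 1, so everything above bit 0 is
    // known to be zero.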
    KnownZero = APInt::getHighBitsSet(32, 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CWidth)
      return;

    unsigned BitWidth = 32;
    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    if (Opc == AMDGPUISD::BFE_U32)
      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);

    break;
  }
  }
}

unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
  SDValue Op,
  const SelectionDAG &DAG,
  unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    if (!isNullConstant(Op.getOperand(1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
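    // A 0/1 result has at least 31 copies of its (zero) sign bit.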
    return 31;

  default:
    return 1;
  }
}
   2988