//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <sstream>

#undef DEBUG_TYPE
#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<unsigned>
FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
                             " 1: do it, 2: do it aggressively)"),
                    cl::init(2));

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
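/// For example (illustrative): for a <4 x float>, this returns four f32
/// entries with byte offsets 0, 4, 8 and 12, whereas ComputeValueVTs would
/// return a single v4f32 entry.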
static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
                               SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    if (VT.isVector()) {
      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
        ValueVTs.push_back(VT.getVectorElementType());
        if (Offsets)
          Offsets->push_back(Off + j * VT.getVectorElementType().getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {

  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is expensive. Don't create extra control flow for 'and' and 'or'
  // conditional branches.
  setJumpIsExpensive(true);

  // By default, use Source scheduling; schedule for register pressure if
  // requested via -nvptx-sched4reg.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);

  // Operations not directly supported by NVPTX.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
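  // For example (illustrative PTX): sign-extending the low 8 bits of an i32
  // value can use "cvt.s32.s8 %r, %r;", while i1 needs a shift pair such as
  // "shl.b32 %r, %r, 31; shr.s32 %r, %r, 31;".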
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  if (STI.hasROT64()) {
    setOperationAction(ISD::ROTL, MVT::i64, Legal);
    setOperationAction(ISD::ROTR, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i64, Expand);
    setOperationAction(ISD::ROTR, MVT::i64, Expand);
  }
  if (STI.hasROT32()) {
    setOperationAction(ISD::ROTL, MVT::i32, Legal);
    setOperationAction(ISD::ROTR, MVT::i32, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i32, Expand);
    setOperationAction(ISD::ROTR, MVT::i32, Expand);
  }

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables jump table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fextend.
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  // Turn FP truncstore into trunc + store.
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load/store of predicate registers.
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // FP constants are legal in NVPTX.
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);

  // TRAP can be lowered to the PTX trap instruction.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  setOperationAction(ISD::ADDC, MVT::i64, Expand);
  setOperationAction(ISD::ADDE, MVT::i64, Expand);

  // Register custom handling for vector loads/stores.
  for (MVT VT : MVT::vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::CTLZ, MVT::i16, Legal);
  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTPOP, MVT::i16, Legal);
  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
  setOperationAction(ISD::CTPOP, MVT::i64, Legal);

  // PTX does not directly support SELP of i1, so promote to i32 first.
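  // For example (illustrative): after promotion, "selp.b32 %r, %a, %b, %p;"
  // performs the select, and the i32 result is truncated back to i1.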
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // We have some custom DAG combine patterns for these nodes.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);

  // Now deduce the information based on the above-mentioned actions.
  computeRegisterProperties(STI.getRegisterInfo());
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default:
    return nullptr;
  case NVPTXISD::CALL:
    return "NVPTXISD::CALL";
  case NVPTXISD::RET_FLAG:
    return "NVPTXISD::RET_FLAG";
  case NVPTXISD::Wrapper:
    return "NVPTXISD::Wrapper";
  case NVPTXISD::DeclareParam:
    return "NVPTXISD::DeclareParam";
  case NVPTXISD::DeclareScalarParam:
    return "NVPTXISD::DeclareScalarParam";
  case NVPTXISD::DeclareRet:
    return "NVPTXISD::DeclareRet";
  case NVPTXISD::DeclareRetParam:
    return "NVPTXISD::DeclareRetParam";
  case NVPTXISD::PrintCall:
    return "NVPTXISD::PrintCall";
  case NVPTXISD::LoadParam:
    return "NVPTXISD::LoadParam";
  case NVPTXISD::LoadParamV2:
    return "NVPTXISD::LoadParamV2";
  case NVPTXISD::LoadParamV4:
    return "NVPTXISD::LoadParamV4";
  case NVPTXISD::StoreParam:
    return "NVPTXISD::StoreParam";
  case NVPTXISD::StoreParamV2:
    return "NVPTXISD::StoreParamV2";
  case NVPTXISD::StoreParamV4:
    return "NVPTXISD::StoreParamV4";
  case NVPTXISD::StoreParamS32:
    return "NVPTXISD::StoreParamS32";
  case NVPTXISD::StoreParamU32:
    return "NVPTXISD::StoreParamU32";
  case NVPTXISD::CallArgBegin:
    return "NVPTXISD::CallArgBegin";
  case NVPTXISD::CallArg:
    return "NVPTXISD::CallArg";
  case NVPTXISD::LastCallArg:
    return "NVPTXISD::LastCallArg";
  case NVPTXISD::CallArgEnd:
    return "NVPTXISD::CallArgEnd";
  case NVPTXISD::CallVoid:
    return "NVPTXISD::CallVoid";
  case NVPTXISD::CallVal:
    return "NVPTXISD::CallVal";
  case NVPTXISD::CallSymbol:
    return "NVPTXISD::CallSymbol";
  case NVPTXISD::Prototype:
    return "NVPTXISD::Prototype";
  case NVPTXISD::MoveParam:
    return "NVPTXISD::MoveParam";
  case NVPTXISD::StoreRetval:
    return "NVPTXISD::StoreRetval";
  case NVPTXISD::StoreRetvalV2:
    return "NVPTXISD::StoreRetvalV2";
  case NVPTXISD::StoreRetvalV4:
    return "NVPTXISD::StoreRetvalV4";
  case NVPTXISD::PseudoUseParam:
    return "NVPTXISD::PseudoUseParam";
  case NVPTXISD::RETURN:
    return "NVPTXISD::RETURN";
  case NVPTXISD::CallSeqBegin:
    return "NVPTXISD::CallSeqBegin";
  case NVPTXISD::CallSeqEnd:
    return "NVPTXISD::CallSeqEnd";
  case NVPTXISD::CallPrototype:
    return "NVPTXISD::CallPrototype";
  case NVPTXISD::LoadV2:
    return "NVPTXISD::LoadV2";
  case NVPTXISD::LoadV4:
    return "NVPTXISD::LoadV4";
  case NVPTXISD::LDGV2:
    return "NVPTXISD::LDGV2";
  case NVPTXISD::LDGV4:
    return "NVPTXISD::LDGV4";
  case NVPTXISD::LDUV2:
    return "NVPTXISD::LDUV2";
  case NVPTXISD::LDUV4:
    return "NVPTXISD::LDUV4";
  case NVPTXISD::StoreV2:
    return "NVPTXISD::StoreV2";
  case NVPTXISD::StoreV4:
    return "NVPTXISD::StoreV4";
  case NVPTXISD::FUN_SHFL_CLAMP:
    return "NVPTXISD::FUN_SHFL_CLAMP";
  case NVPTXISD::FUN_SHFR_CLAMP:
    return "NVPTXISD::FUN_SHFR_CLAMP";
  case NVPTXISD::IMAD:
    return "NVPTXISD::IMAD";
  case NVPTXISD::MUL_WIDE_SIGNED:
    return "NVPTXISD::MUL_WIDE_SIGNED";
  case NVPTXISD::MUL_WIDE_UNSIGNED:
    return "NVPTXISD::MUL_WIDE_UNSIGNED";
  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
  case NVPTXISD::Tex1DFloatFloatLevel:
    return "NVPTXISD::Tex1DFloatFloatLevel";
  case NVPTXISD::Tex1DFloatFloatGrad:
    return "NVPTXISD::Tex1DFloatFloatGrad";
  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
  case NVPTXISD::Tex1DS32FloatLevel:
    return "NVPTXISD::Tex1DS32FloatLevel";
  case NVPTXISD::Tex1DS32FloatGrad:
    return "NVPTXISD::Tex1DS32FloatGrad";
  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
  case NVPTXISD::Tex1DU32FloatLevel:
    return "NVPTXISD::Tex1DU32FloatLevel";
  case NVPTXISD::Tex1DU32FloatGrad:
    return "NVPTXISD::Tex1DU32FloatGrad";
  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    return "NVPTXISD::Tex1DArrayS32FloatLevel";
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    return "NVPTXISD::Tex1DArrayS32FloatGrad";
  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    return "NVPTXISD::Tex1DArrayU32FloatLevel";
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    return "NVPTXISD::Tex1DArrayU32FloatGrad";
  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
  case NVPTXISD::Tex2DFloatFloatLevel:
    return "NVPTXISD::Tex2DFloatFloatLevel";
  case NVPTXISD::Tex2DFloatFloatGrad:
    return "NVPTXISD::Tex2DFloatFloatGrad";
  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
  case NVPTXISD::Tex2DS32FloatLevel:
    return "NVPTXISD::Tex2DS32FloatLevel";
  case NVPTXISD::Tex2DS32FloatGrad:
    return "NVPTXISD::Tex2DS32FloatGrad";
  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
  case NVPTXISD::Tex2DU32FloatLevel:
    return "NVPTXISD::Tex2DU32FloatLevel";
  case NVPTXISD::Tex2DU32FloatGrad:
    return "NVPTXISD::Tex2DU32FloatGrad";
  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    return "NVPTXISD::Tex2DArrayS32FloatLevel";
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    return "NVPTXISD::Tex2DArrayS32FloatGrad";
  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    return "NVPTXISD::Tex2DArrayU32FloatLevel";
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    return "NVPTXISD::Tex2DArrayU32FloatGrad";
  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
  case NVPTXISD::Tex3DFloatFloatLevel:
    return "NVPTXISD::Tex3DFloatFloatLevel";
  case NVPTXISD::Tex3DFloatFloatGrad:
    return "NVPTXISD::Tex3DFloatFloatGrad";
  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
  case NVPTXISD::Tex3DS32FloatLevel:
    return "NVPTXISD::Tex3DS32FloatLevel";
  case NVPTXISD::Tex3DS32FloatGrad:
    return "NVPTXISD::Tex3DS32FloatGrad";
  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
  case NVPTXISD::Tex3DU32FloatLevel:
    return "NVPTXISD::Tex3DU32FloatLevel";
  case NVPTXISD::Tex3DU32FloatGrad:
    return "NVPTXISD::Tex3DU32FloatGrad";
  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
  case NVPTXISD::TexCubeFloatFloatLevel:
    return "NVPTXISD::TexCubeFloatFloatLevel";
  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
  case NVPTXISD::TexCubeS32FloatLevel:
    return "NVPTXISD::TexCubeS32FloatLevel";
  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
  case NVPTXISD::TexCubeU32FloatLevel:
    return "NVPTXISD::TexCubeU32FloatLevel";
  case NVPTXISD::TexCubeArrayFloatFloat:
    return "NVPTXISD::TexCubeArrayFloatFloat";
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  case NVPTXISD::TexCubeArrayS32Float:
    return "NVPTXISD::TexCubeArrayS32Float";
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    return "NVPTXISD::TexCubeArrayS32FloatLevel";
  case NVPTXISD::TexCubeArrayU32Float:
    return "NVPTXISD::TexCubeArrayU32Float";
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    return "NVPTXISD::TexCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4R2DFloatFloat:
    return "NVPTXISD::Tld4R2DFloatFloat";
  case NVPTXISD::Tld4G2DFloatFloat:
    return "NVPTXISD::Tld4G2DFloatFloat";
  case NVPTXISD::Tld4B2DFloatFloat:
    return "NVPTXISD::Tld4B2DFloatFloat";
  case NVPTXISD::Tld4A2DFloatFloat:
    return "NVPTXISD::Tld4A2DFloatFloat";
  case NVPTXISD::Tld4R2DS64Float:
    return "NVPTXISD::Tld4R2DS64Float";
  case NVPTXISD::Tld4G2DS64Float:
    return "NVPTXISD::Tld4G2DS64Float";
  case NVPTXISD::Tld4B2DS64Float:
    return "NVPTXISD::Tld4B2DS64Float";
  case NVPTXISD::Tld4A2DS64Float:
    return "NVPTXISD::Tld4A2DS64Float";
  case NVPTXISD::Tld4R2DU64Float:
    return "NVPTXISD::Tld4R2DU64Float";
  case NVPTXISD::Tld4G2DU64Float:
    return "NVPTXISD::Tld4G2DU64Float";
  case NVPTXISD::Tld4B2DU64Float:
    return "NVPTXISD::Tld4B2DU64Float";
  case NVPTXISD::Tld4A2DU64Float:
    return "NVPTXISD::Tld4A2DU64Float";

  case NVPTXISD::TexUnified1DFloatS32:
    return "NVPTXISD::TexUnified1DFloatS32";
  case NVPTXISD::TexUnified1DFloatFloat:
    return "NVPTXISD::TexUnified1DFloatFloat";
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    return "NVPTXISD::TexUnified1DFloatFloatLevel";
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    return "NVPTXISD::TexUnified1DFloatFloatGrad";
  case NVPTXISD::TexUnified1DS32S32:
    return "NVPTXISD::TexUnified1DS32S32";
  case NVPTXISD::TexUnified1DS32Float:
    return "NVPTXISD::TexUnified1DS32Float";
  case NVPTXISD::TexUnified1DS32FloatLevel:
    return "NVPTXISD::TexUnified1DS32FloatLevel";
  case NVPTXISD::TexUnified1DS32FloatGrad:
    return "NVPTXISD::TexUnified1DS32FloatGrad";
  case NVPTXISD::TexUnified1DU32S32:
    return "NVPTXISD::TexUnified1DU32S32";
  case NVPTXISD::TexUnified1DU32Float:
    return "NVPTXISD::TexUnified1DU32Float";
  case NVPTXISD::TexUnified1DU32FloatLevel:
    return "NVPTXISD::TexUnified1DU32FloatLevel";
  case NVPTXISD::TexUnified1DU32FloatGrad:
    return "NVPTXISD::TexUnified1DU32FloatGrad";
  case NVPTXISD::TexUnified1DArrayFloatS32:
    return "NVPTXISD::TexUnified1DArrayFloatS32";
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    return "NVPTXISD::TexUnified1DArrayFloatFloat";
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified1DArrayS32S32:
    return "NVPTXISD::TexUnified1DArrayS32S32";
  case NVPTXISD::TexUnified1DArrayS32Float:
    return "NVPTXISD::TexUnified1DArrayS32Float";
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  case NVPTXISD::TexUnified1DArrayU32S32:
    return "NVPTXISD::TexUnified1DArrayU32S32";
  case NVPTXISD::TexUnified1DArrayU32Float:
    return "NVPTXISD::TexUnified1DArrayU32Float";
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  case NVPTXISD::TexUnified2DFloatS32:
    return "NVPTXISD::TexUnified2DFloatS32";
  case NVPTXISD::TexUnified2DFloatFloat:
    return "NVPTXISD::TexUnified2DFloatFloat";
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    return "NVPTXISD::TexUnified2DFloatFloatLevel";
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    return "NVPTXISD::TexUnified2DFloatFloatGrad";
  case NVPTXISD::TexUnified2DS32S32:
    return "NVPTXISD::TexUnified2DS32S32";
  case NVPTXISD::TexUnified2DS32Float:
    return "NVPTXISD::TexUnified2DS32Float";
  case NVPTXISD::TexUnified2DS32FloatLevel:
    return "NVPTXISD::TexUnified2DS32FloatLevel";
  case NVPTXISD::TexUnified2DS32FloatGrad:
    return "NVPTXISD::TexUnified2DS32FloatGrad";
  case NVPTXISD::TexUnified2DU32S32:
    return "NVPTXISD::TexUnified2DU32S32";
  case NVPTXISD::TexUnified2DU32Float:
    return "NVPTXISD::TexUnified2DU32Float";
  case NVPTXISD::TexUnified2DU32FloatLevel:
    return "NVPTXISD::TexUnified2DU32FloatLevel";
  case NVPTXISD::TexUnified2DU32FloatGrad:
    return "NVPTXISD::TexUnified2DU32FloatGrad";
  case NVPTXISD::TexUnified2DArrayFloatS32:
    return "NVPTXISD::TexUnified2DArrayFloatS32";
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    return "NVPTXISD::TexUnified2DArrayFloatFloat";
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified2DArrayS32S32:
    return "NVPTXISD::TexUnified2DArrayS32S32";
  case NVPTXISD::TexUnified2DArrayS32Float:
    return "NVPTXISD::TexUnified2DArrayS32Float";
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  case NVPTXISD::TexUnified2DArrayU32S32:
    return "NVPTXISD::TexUnified2DArrayU32S32";
  case NVPTXISD::TexUnified2DArrayU32Float:
    return "NVPTXISD::TexUnified2DArrayU32Float";
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  case NVPTXISD::TexUnified3DFloatS32:
    return "NVPTXISD::TexUnified3DFloatS32";
  case NVPTXISD::TexUnified3DFloatFloat:
    return "NVPTXISD::TexUnified3DFloatFloat";
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    return "NVPTXISD::TexUnified3DFloatFloatLevel";
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    return "NVPTXISD::TexUnified3DFloatFloatGrad";
  case NVPTXISD::TexUnified3DS32S32:
    return "NVPTXISD::TexUnified3DS32S32";
  case NVPTXISD::TexUnified3DS32Float:
    return "NVPTXISD::TexUnified3DS32Float";
  case NVPTXISD::TexUnified3DS32FloatLevel:
    return "NVPTXISD::TexUnified3DS32FloatLevel";
  case NVPTXISD::TexUnified3DS32FloatGrad:
    return "NVPTXISD::TexUnified3DS32FloatGrad";
  case NVPTXISD::TexUnified3DU32S32:
    return "NVPTXISD::TexUnified3DU32S32";
  case NVPTXISD::TexUnified3DU32Float:
    return "NVPTXISD::TexUnified3DU32Float";
  case NVPTXISD::TexUnified3DU32FloatLevel:
    return "NVPTXISD::TexUnified3DU32FloatLevel";
  case NVPTXISD::TexUnified3DU32FloatGrad:
    return "NVPTXISD::TexUnified3DU32FloatGrad";
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    return "NVPTXISD::TexUnifiedCubeFloatFloat";
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeS32Float:
    return "NVPTXISD::TexUnifiedCubeS32Float";
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeU32Float:
    return "NVPTXISD::TexUnifiedCubeU32Float";
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    return "NVPTXISD::Tld4UnifiedR2DS64Float";
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    return "NVPTXISD::Tld4UnifiedG2DS64Float";
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    return "NVPTXISD::Tld4UnifiedB2DS64Float";
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    return "NVPTXISD::Tld4UnifiedA2DS64Float";
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    return "NVPTXISD::Tld4UnifiedR2DU64Float";
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    return "NVPTXISD::Tld4UnifiedG2DU64Float";
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    return "NVPTXISD::Tld4UnifiedB2DU64Float";
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    return "NVPTXISD::Tld4UnifiedA2DU64Float";

  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";

  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";

  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";

  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";

  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";

  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";

  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";

  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";

  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";

  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";

  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";

  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";

  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";

  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";

  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
  }
}

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
  return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
}

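// Builds the PTX ".callprototype" string used for indirect calls. For an
// illustrative call to "float f(int, float)", the output is something like:
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);
// (exact output depends on the call site; sketch only).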
std::string
NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  unsigned retAlignment,
                                  const ImmutableCallSite *CS) const {

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::stringstream O;
  O << "prototype_" << uniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
      unsigned size = 0;
      if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
        if (size < 32)
          size = 32;
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << getPointerTy().getSizeInBits() << " _";
    } else if ((retTy->getTypeID() == Type::StructTyID) ||
               isa<VectorType>(retTy)) {
      O << ".param .align "
        << retAlignment
        << " .b8 _["
        << getDataLayout()->getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;
  MVT thePointerTy = getPointerTy();

  unsigned OIdx = 0;
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (Ty->isAggregateType() || Ty->isVectorTy()) {
        unsigned align = 0;
        const CallInst *CallI = cast<CallInst>(CS->getInstruction());
        const DataLayout *TD = getDataLayout();
        // +1 because index 0 is reserved for return type alignment.
        if (!llvm::getAlign(*CallI, i + 1, align))
          align = TD->getABITypeAlignment(Ty);
        unsigned sz = TD->getTypeAllocSize(Ty);
        O << ".param .align " << align << " .b8 ";
        O << "_";
        O << "[" << sz << "]";
        // Update the index for Outs.
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG.
      assert((getValueType(Ty) == Outs[OIdx].VT ||
              (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      // Scalar type.
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        if (sz < 32)
          sz = 32;
      } else if (isa<PointerType>(Ty))
        sz = thePointerTy.getSizeInBits();
      else
        sz = Ty->getPrimitiveSizeInBits();
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }
    const PointerType *PTy = dyn_cast<PointerType>(Ty);
    assert(PTy && "Param with byval attribute should be a pointer type");
    Type *ETy = PTy->getElementType();

    unsigned align = Outs[OIdx].Flags.getByValAlign();
    unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
    O << ".param .align " << align << " .b8 ";
    O << "_";
    O << "[" << sz << "]";
  }
  O << ");";
  return O.str();
}

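// Determines the alignment to use for parameter Idx of a call. An illustrative
// case where the callee is not directly visible (pre-opaque-pointer IR):
//   %r = call i32 bitcast (i8 (i8)* @f to i32 (i32)*)(i32 %x)
// Here the ConstantExpr casts are peeled away until @f is found, so @f's
// alignment metadata can still be used.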
unsigned
NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
                                          const ImmutableCallSite *CS,
                                          Type *Ty,
                                          unsigned Idx) const {
  const DataLayout *TD = getDataLayout();
  unsigned Align = 0;
  const Value *DirectCallee = CS->getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.
    const Instruction *CalleeI = CS->getInstruction();
    assert(CalleeI && "Call target is not a function or derived value?");

    // With bitcast'd call targets, the instruction will be the call.
    if (isa<CallInst>(CalleeI)) {
      // Check if we have call alignment metadata.
      if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
        return Align;

      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
      // Ignore any bitcast instructions.
      while (isa<ConstantExpr>(CalleeV)) {
        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
        if (!CE->isCast())
          break;
        // Look through the bitcast.
        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
      }

      // We have now looked past all of the bitcasts.  Do we finally have a
      // Function?
      if (isa<Function>(CalleeV))
        DirectCallee = CalleeV;
    }
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function.
  if (DirectCallee)
    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
      return Align;

  // Call is indirect or alignment information is not available; fall back to
  // the ABI type alignment.
  return TD->getABITypeAlignment(Ty);
}

SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  ArgListTy &Args = CLI.getArgs();
  Type *retTy = CLI.RetTy;
  ImmutableCallSite *CS = CLI.CS;

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;
  const DataLayout *TD = getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const Function *F = MF.getFunction();

  SDValue tempChain = Chain;
  Chain =
      DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
                           dl);
  SDValue InFlag = Chain.getValue(1);

  unsigned paramCount = 0;
  // Args.size() and Outs.size() need not match.
  // Outs.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Outs), or
  //   * if there is a vector argument with more elements than the typical
  //     vector length (generally more than 4), where each vector element is
  //     individually present in Outs.
  // So a different index should be used for indexing into Outs/OutVals.
  // See the similar issue in LowerFormalArguments.
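  // For example (illustrative): passing a struct { i32, i32 } contributes one
  // entry to Args but two entries to Outs.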
   1071   unsigned OIdx = 0;
   1072   // Declare the .params or .reg need to pass values
   1073   // to the function
   1074   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
   1075     EVT VT = Outs[OIdx].VT;
   1076     Type *Ty = Args[i].Ty;
   1077 
   1078     if (!Outs[OIdx].Flags.isByVal()) {
   1079       if (Ty->isAggregateType()) {
   1080         // aggregate
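        // For illustration (a sketch, not emitted verbatim): a
        // struct { int a; float b; } argument would be declared as
        //   .param .align 4 .b8 param<n>[8];
        // with each field then copied in by a StoreParam at byte offsets
        // 0 and 4, respectively.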
        SmallVector<EVT, 16> vtparts;
        SmallVector<uint64_t, 16> Offsets;
        ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);

        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
        // declare .param .align <align> .b8 .param<n>[<size>];
        unsigned sz = TD->getTypeAllocSize(Ty);
        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
                                      DAG.getConstant(paramCount, MVT::i32),
                                      DAG.getConstant(sz, MVT::i32), InFlag };
        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                            DeclareParamOps);
        InFlag = Chain.getValue(1);
        for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
          EVT elemtype = vtparts[j];
          unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
          if (elemtype.isInteger() && (sz < 8))
            sz = 8;
          SDValue StVal = OutVals[OIdx];
          if (elemtype.getSizeInBits() < 16) {
            StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
          }
          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
          SDValue CopyParamOps[] = { Chain,
                                     DAG.getConstant(paramCount, MVT::i32),
                                     DAG.getConstant(Offsets[j], MVT::i32),
                                     StVal, InFlag };
          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
                                          CopyParamVTs, CopyParamOps,
                                          elemtype, MachinePointerInfo(),
                                          ArgAlign);
          InFlag = Chain.getValue(1);
          ++OIdx;
        }
        if (vtparts.size() > 0)
          --OIdx;
        ++paramCount;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(Ty);
        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
        // declare .param .align <align> .b8 .param<n>[<size>];
        unsigned sz = TD->getTypeAllocSize(Ty);
        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
                                      DAG.getConstant(paramCount, MVT::i32),
                                      DAG.getConstant(sz, MVT::i32), InFlag };
        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                            DeclareParamOps);
        InFlag = Chain.getValue(1);
        unsigned NumElts = ObjectVT.getVectorNumElements();
        EVT EltVT = ObjectVT.getVectorElementType();
        EVT MemVT = EltVT;
        bool NeedExtend = false;
        if (EltVT.getSizeInBits() < 16) {
          NeedExtend = true;
          EltVT = MVT::i16;
        }

        // V1 store
        if (NumElts == 1) {
          SDValue Elt = OutVals[OIdx++];
          if (NeedExtend)
            Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);

          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
          SDValue CopyParamOps[] = { Chain,
                                     DAG.getConstant(paramCount, MVT::i32),
                                     DAG.getConstant(0, MVT::i32), Elt,
                                     InFlag };
          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
                                          CopyParamVTs, CopyParamOps,
                                          MemVT, MachinePointerInfo());
          InFlag = Chain.getValue(1);
        } else if (NumElts == 2) {
          SDValue Elt0 = OutVals[OIdx++];
          SDValue Elt1 = OutVals[OIdx++];
          if (NeedExtend) {
            Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
            Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
          }

          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
          SDValue CopyParamOps[] = { Chain,
                                     DAG.getConstant(paramCount, MVT::i32),
                                     DAG.getConstant(0, MVT::i32), Elt0, Elt1,
                                     InFlag };
          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
                                          CopyParamVTs, CopyParamOps,
                                          MemVT, MachinePointerInfo());
          InFlag = Chain.getValue(1);
        } else {
          unsigned curOffset = 0;
          // V4 stores
          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
          // the vector will be expanded to a power of 2 elements, so we know
          // we can always round up to the next multiple of 4 when creating
          // the vector stores.
          // e.g.  4 elem => 1 st.v4
          //       6 elem => 2 st.v4
          //       8 elem => 2 st.v4
          //      11 elem => 3 st.v4
          unsigned VecSize = 4;
          if (EltVT.getSizeInBits() == 64)
            VecSize = 2;

          // This is potentially only part of a vector, so assume all elements
          // are packed together.
          unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;

          for (unsigned i = 0; i < NumElts; i += VecSize) {
            // Get values
            SDValue StoreVal;
            SmallVector<SDValue, 8> Ops;
            Ops.push_back(Chain);
            Ops.push_back(DAG.getConstant(paramCount, MVT::i32));
            Ops.push_back(DAG.getConstant(curOffset, MVT::i32));

            unsigned Opc = NVPTXISD::StoreParamV2;

            StoreVal = OutVals[OIdx++];
            if (NeedExtend)
              StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
            Ops.push_back(StoreVal);

            if (i + 1 < NumElts) {
              StoreVal = OutVals[OIdx++];
              if (NeedExtend)
                StoreVal =
                    DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
            } else {
              StoreVal = DAG.getUNDEF(EltVT);
            }
            Ops.push_back(StoreVal);

            if (VecSize == 4) {
              Opc = NVPTXISD::StoreParamV4;
              if (i + 2 < NumElts) {
                StoreVal = OutVals[OIdx++];
                if (NeedExtend)
                  StoreVal =
                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
              } else {
                StoreVal = DAG.getUNDEF(EltVT);
              }
              Ops.push_back(StoreVal);

              if (i + 3 < NumElts) {
                StoreVal = OutVals[OIdx++];
                if (NeedExtend)
                  StoreVal =
                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
              } else {
                StoreVal = DAG.getUNDEF(EltVT);
              }
              Ops.push_back(StoreVal);
            }

            Ops.push_back(InFlag);

            SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
                                            MemVT, MachinePointerInfo());
            InFlag = Chain.getValue(1);
            curOffset += PerStoreOffset;
          }
        }
        ++paramCount;
        --OIdx;
        continue;
      }
      // Plain scalar: for ABI, declare .param .b<size> .param<n>;
      unsigned sz = VT.getSizeInBits();
      bool needExtend = false;
      if (VT.isInteger()) {
        if (sz < 16)
          needExtend = true;
        if (sz < 32)
          sz = 32;
      }
      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareParamOps[] = { Chain,
                                    DAG.getConstant(paramCount, MVT::i32),
                                    DAG.getConstant(sz, MVT::i32),
                                    DAG.getConstant(0, MVT::i32), InFlag };
      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
                          DeclareParamOps);
      InFlag = Chain.getValue(1);
      SDValue OutV = OutVals[OIdx];
      if (needExtend) {
        // zext/sext sub-16-bit values (i1/i8) to i16
        unsigned opc = ISD::ZERO_EXTEND;
        if (Outs[OIdx].Flags.isSExt())
          opc = ISD::SIGN_EXTEND;
        OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
      }
      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
                                 DAG.getConstant(0, MVT::i32), OutV, InFlag };

      unsigned opcode = NVPTXISD::StoreParam;
      if (Outs[OIdx].Flags.isZExt())
        opcode = NVPTXISD::StoreParamU32;
      else if (Outs[OIdx].Flags.isSExt())
        opcode = NVPTXISD::StoreParamS32;
      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
                                      VT, MachinePointerInfo());

      InFlag = Chain.getValue(1);
      ++paramCount;
      continue;
    }
    // Byval struct or vector.
    SmallVector<EVT, 16> vtparts;
    SmallVector<uint64_t, 16> Offsets;
    const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
    assert(PTy && "Type of a byval parameter should be pointer");
    ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);

    // declare .param .align <align> .b8 .param<n>[<size>];
    unsigned sz = Outs[OIdx].Flags.getByValSize();
    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
    // The ByValAlign in Outs[OIdx].Flags is always set at this point,
    // so we don't need to worry about natural alignment or not.
    // See TargetLowering::LowerCallTo().
    SDValue DeclareParamOps[] = {
      Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), MVT::i32),
      DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32),
      InFlag
    };
    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                        DeclareParamOps);
    InFlag = Chain.getValue(1);
    for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
      EVT elemtype = vtparts[j];
      int curOffset = Offsets[j];
      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
      SDValue srcAddr =
          DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
                      DAG.getConstant(curOffset, getPointerTy()));
      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
                                   MachinePointerInfo(), false, false, false,
                                   PartAlign);
      if (elemtype.getSizeInBits() < 16) {
        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
      }
      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
                                 DAG.getConstant(curOffset, MVT::i32), theVal,
                                 InFlag };
      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
                                      CopyParamOps, elemtype,
                                      MachinePointerInfo());

      InFlag = Chain.getValue(1);
    }
    ++paramCount;
  }

  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
  unsigned retAlignment = 0;

  // Handle the result.
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> resvtparts;
    ComputeValueVTs(*this, retTy, resvtparts);

    // Declare
    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
    //  .param .b<size-in-bits> retval0
    unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
    // these three types to match the logic in
    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
    // Plus, this behavior is consistent with nvcc's.
    if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
        retTy->isPointerTy()) {
      // A scalar needs to be at least 32 bits wide.
      if (resultsz < 32)
        resultsz = 32;
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
                                  DAG.getConstant(resultsz, MVT::i32),
                                  DAG.getConstant(0, MVT::i32), InFlag };
      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    } else {
      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue DeclareRetOps[] = { Chain,
                                  DAG.getConstant(retAlignment, MVT::i32),
                                  DAG.getConstant(resultsz / 8, MVT::i32),
                                  DAG.getConstant(0, MVT::i32), InFlag };
      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
                          DeclareRetOps);
      InFlag = Chain.getValue(1);
    }
  }

  if (!Func) {
    // This is the indirect function call case: PTX requires a prototype of
    // the form
    //   proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the
    // call instruction.
    // The prototype is embedded in a string and used as the operand of a
    // CallPrototype SDNode, which prints out as the value of the string.
    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
    const char *ProtoStr =
      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
    SDValue ProtoOps[] = {
      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
    };
    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
    InFlag = Chain.getValue(1);
  }
  // Op to just print "call"
  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue PrintCallOps[] = {
    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag
  };
  Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
                      dl, PrintCallVTs, PrintCallOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the function name
  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
  InFlag = Chain.getValue(1);

  // Ops to print out the param list
  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgBeginOps[] = { Chain, InFlag };
  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
                      CallArgBeginOps);
  InFlag = Chain.getValue(1);

  for (unsigned i = 0, e = paramCount; i != e; ++i) {
    unsigned opcode;
    if (i == (e - 1))
      opcode = NVPTXISD::LastCallArg;
    else
      opcode = NVPTXISD::CallArg;
    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32),
                             DAG.getConstant(i, MVT::i32), InFlag };
    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
    InFlag = Chain.getValue(1);
  }
  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 1 : 0, MVT::i32),
                              InFlag };
  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
  InFlag = Chain.getValue(1);

  if (!Func) {
    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32),
                               InFlag };
    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
    InFlag = Chain.getValue(1);
  }

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
    if (retTy && retTy->isVectorTy()) {
      EVT ObjectVT = getValueType(retTy);
      unsigned NumElts = ObjectVT.getVectorNumElements();
      EVT EltVT = ObjectVT.getVectorElementType();
      assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
                                                      ObjectVT) == NumElts &&
             "Vector was not scalarized");
      unsigned sz = EltVT.getSizeInBits();
      bool needTruncate = sz < 8;

      if (NumElts == 1) {
        // Just a simple load
        SmallVector<EVT, 4> LoadRetVTs;
        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
          // If loading an i1/i8 result, generate
          //   load.b8 i16
          //   if i1
          //   trunc i16 to i1
          LoadRetVTs.push_back(MVT::i16);
        } else
          LoadRetVTs.push_back(EltVT);
        LoadRetVTs.push_back(MVT::Other);
        LoadRetVTs.push_back(MVT::Glue);
        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
                                DAG.getConstant(0, MVT::i32), InFlag};
        SDValue retval = DAG.getMemIntrinsicNode(
            NVPTXISD::LoadParam, dl,
            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
        Chain = retval.getValue(1);
        InFlag = retval.getValue(2);
        SDValue Ret0 = retval;
        if (needTruncate)
          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
        InVals.push_back(Ret0);
      } else if (NumElts == 2) {
        // LoadV2
        SmallVector<EVT, 4> LoadRetVTs;
        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
          // If loading an i1/i8 result, generate
          //   load.b8 i16
          //   if i1
          //   trunc i16 to i1
          LoadRetVTs.push_back(MVT::i16);
          LoadRetVTs.push_back(MVT::i16);
        } else {
          LoadRetVTs.push_back(EltVT);
          LoadRetVTs.push_back(EltVT);
        }
        LoadRetVTs.push_back(MVT::Other);
        LoadRetVTs.push_back(MVT::Glue);
        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
                                DAG.getConstant(0, MVT::i32), InFlag};
        SDValue retval = DAG.getMemIntrinsicNode(
            NVPTXISD::LoadParamV2, dl,
            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
        Chain = retval.getValue(2);
        InFlag = retval.getValue(3);
        SDValue Ret0 = retval.getValue(0);
        SDValue Ret1 = retval.getValue(1);
        if (needTruncate) {
          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
          InVals.push_back(Ret0);
          Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
          InVals.push_back(Ret1);
        } else {
          InVals.push_back(Ret0);
          InVals.push_back(Ret1);
        }
      } else {
        // Split into N LoadV4 operations.
        unsigned Ofst = 0;
        unsigned VecSize = 4;
        unsigned Opc = NVPTXISD::LoadParamV4;
        if (EltVT.getSizeInBits() == 64) {
          VecSize = 2;
          Opc = NVPTXISD::LoadParamV2;
        }
        EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
        for (unsigned i = 0; i < NumElts; i += VecSize) {
          SmallVector<EVT, 8> LoadRetVTs;
          if (EltVT == MVT::i1 || EltVT == MVT::i8) {
            // If loading an i1/i8 result, generate
            //   load.b8 i16
            //   if i1
            //   trunc i16 to i1
            for (unsigned j = 0; j < VecSize; ++j)
              LoadRetVTs.push_back(MVT::i16);
          } else {
            for (unsigned j = 0; j < VecSize; ++j)
              LoadRetVTs.push_back(EltVT);
          }
          LoadRetVTs.push_back(MVT::Other);
          LoadRetVTs.push_back(MVT::Glue);
          SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
                                  DAG.getConstant(Ofst, MVT::i32), InFlag};
          SDValue retval = DAG.getMemIntrinsicNode(
              Opc, dl, DAG.getVTList(LoadRetVTs),
              LoadRetOps, EltVT, MachinePointerInfo());
          if (VecSize == 2) {
            Chain = retval.getValue(2);
            InFlag = retval.getValue(3);
          } else {
            Chain = retval.getValue(4);
            InFlag = retval.getValue(5);
          }

          for (unsigned j = 0; j < VecSize; ++j) {
            if (i + j >= NumElts)
              break;
            SDValue Elt = retval.getValue(j);
            if (needTruncate)
              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
            InVals.push_back(Elt);
          }
          Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
        }
      }
    } else {
      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
      assert(VTs.size() == Ins.size() && "Bad value decomposition");
      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
        unsigned sz = VTs[i].getSizeInBits();
        unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
        bool needTruncate = sz < 8;
        if (VTs[i].isInteger() && (sz < 8))
          sz = 8;

        SmallVector<EVT, 4> LoadRetVTs;
        EVT TheLoadType = VTs[i];
        if (retTy->isIntegerTy() &&
            TD->getTypeAllocSizeInBits(retTy) < 32) {
          // This is for integer types only, and specifically not for
          // aggregates.
          LoadRetVTs.push_back(MVT::i32);
          TheLoadType = MVT::i32;
        } else if (sz < 16) {
          // If loading an i1/i8 result, generate
          //   load i8 (-> i16)
          //   trunc i16 to i1/i8
          LoadRetVTs.push_back(MVT::i16);
        } else
          LoadRetVTs.push_back(Ins[i].VT);
        LoadRetVTs.push_back(MVT::Other);
        LoadRetVTs.push_back(MVT::Glue);

        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, MVT::i32),
                                DAG.getConstant(Offsets[i], MVT::i32), InFlag};
        SDValue retval = DAG.getMemIntrinsicNode(
            NVPTXISD::LoadParam, dl,
            DAG.getVTList(LoadRetVTs), LoadRetOps,
            TheLoadType, MachinePointerInfo(), AlignI);
        Chain = retval.getValue(1);
        InFlag = retval.getValue(2);
        SDValue Ret0 = retval.getValue(0);
        if (needTruncate)
          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
        InVals.push_back(Ret0);
      }
    }
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
                             DAG.getIntPtrConstant(uniqueCallSite + 1, true),
                             InFlag, dl);
  uniqueCallSite++;

  // Set isTailCall to false for now, until we figure out how to express
  // tail call optimization in PTX.
  isTailCall = false;
  return Chain;
}

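// For reference, the node sequence above prints as a PTX call such as the
// following (a sketch for a hypothetical "float foo(float, float)"; exact
// sizes and register names depend on the types involved):
//   .param .b32 param0;
//   st.param.b32 [param0+0], %f1;
//   .param .b32 param1;
//   st.param.b32 [param1+0], %f2;
//   .param .b32 retval0;
//   call.uni (retval0), foo, (param0, param1);
//   ld.param.b32 %f3, [retval0+0];
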
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
// (see LegalizeDAG.cpp), which is slow and uses local memory. We instead use
// extract/build_vector nodes, just as LegalizeOp() did in LLVM 2.5.
SDValue
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  SmallVector<SDValue, 8> Ops;
  unsigned NumOperands = Node->getNumOperands();
  for (unsigned i = 0; i < NumOperands; ++i) {
    SDValue SubOp = Node->getOperand(i);
    EVT VVT = SubOp.getNode()->getValueType(0);
    EVT EltVT = VVT.getVectorElementType();
    unsigned NumSubElem = VVT.getVectorNumElements();
    for (unsigned j = 0; j < NumSubElem; ++j) {
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
                                DAG.getIntPtrConstant(j)));
    }
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
}
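
// E.g. (v4f32 (concat_vectors (v2f32 a), (v2f32 b))) becomes a single
// BUILD_VECTOR of the four f32 elements extracted from a and b.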

/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32-bit parts on sm_35 and later, we can use the funnel shift 'shf'
    // instruction.
    // {dHi, dLo} = {aHi, aLo} >> Amt
    //   dHi = aHi >> Amt
    //   dLo = shf.r.clamp aLo, aHi, Amt

    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  } else {
    // {dHi, dLo} = {aHi, aLo} >> Amt
    // - if (Amt>=size) then
    //      dLo = aHi >> (Amt-size)
    //      dHi = aHi >> Amt (this is either all 0 or all 1)
    //   else
    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
    //      dHi = aHi >> Amt

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}
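
// Worked example for the generic path: a 64-bit logical shift right by
// Amt = 40 has Amt >= 32, so dLo = aHi >> (40 - 32) = aHi >> 8, and
// dHi = aHi >> 40, which, as the comment above notes, is all zeros
// (all sign bits for the arithmetic variant).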

/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  assert(Op.getOpcode() == ISD::SHL_PARTS);

  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);

  if (VTBits == 32 && STI.getSmVersion() >= 35) {
    // For 32-bit parts on sm_35 and later, we can use the funnel shift 'shf'
    // instruction.
    // {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt
    //   dLo = aLo << Amt

    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
                             ShAmt);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  } else {
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    //   else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))

    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DAG.getConstant(VTBits, MVT::i32), ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                     DAG.getConstant(VTBits, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
                               DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}
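
// Mirror of the example above: a 64-bit shift left by Amt = 40 yields
// dHi = aLo << 8 and dLo = aLo << 40 = 0.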

SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::RETURNADDR:
    return SDValue();
  case ISD::FRAMEADDR:
    return SDValue();
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return Op;
  case ISD::BUILD_VECTOR:
  case ISD::EXTRACT_SUBVECTOR:
    return Op;
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftRightParts(Op, DAG);
  case ISD::SELECT:
    return LowerSelect(Op, DAG);
  default:
    llvm_unreachable("Custom lowering not defined for operation");
  }
}

SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
  SDValue Op0 = Op->getOperand(0);
  SDValue Op1 = Op->getOperand(1);
  SDValue Op2 = Op->getOperand(2);
  SDLoc DL(Op.getNode());

  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");

  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);

  return Trunc;
}
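
// I.e. an i1 select is lowered by any-extending the operands to i32,
// selecting on i32 (typically a selp in the final PTX), and truncating
// the result back to i1.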

SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    return LowerLOADi1(Op, DAG);
  else
    return SDValue();
}

// v = ld i1* addr
//   =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  LoadSDNode *LD = cast<LoadSDNode>(Node);
  SDLoc dl(Node);
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  assert(Node->getValueType(0) == MVT::i1 &&
         "Custom lowering for i1 load only");
  SDValue newLD =
      DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
                  LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
                  LD->isInvariant(), LD->getAlignment());
  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  // The legalizer (the caller) is expecting two values from the legalized
  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  // in LegalizeDAG.cpp, which also uses MergeValues.
  SDValue Ops[] = { result, LD->getChain() };
  return DAG.getMergeValues(Ops, dl);
}

SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  EVT ValVT = Op.getOperand(1).getValueType();
  if (ValVT == MVT::i1)
    return LowerSTOREi1(Op, DAG);
  else if (ValVT.isVector())
    return LowerSTOREVector(Op, DAG);
  else
    return SDValue();
}

SDValue
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
  SDNode *N = Op.getNode();
  SDValue Val = N->getOperand(1);
  SDLoc DL(N);
  EVT ValVT = Val.getValueType();

  if (ValVT.isVector()) {
    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
    // legal.  We can (and should) split that into 2 stores of <2 x double>
    // here, but I'm leaving that as a TODO for now.
    if (!ValVT.isSimple())
      return SDValue();
    switch (ValVT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v2i8:
    case MVT::v2i16:
    case MVT::v2i32:
    case MVT::v2i64:
    case MVT::v2f32:
    case MVT::v2f64:
    case MVT::v4i8:
    case MVT::v4i16:
    case MVT::v4i32:
    case MVT::v4f32:
      // This is a "native" vector type
      break;
    }

    MemSDNode *MemSD = cast<MemSDNode>(N);
    const DataLayout *TD = getDataLayout();

    unsigned Align = MemSD->getAlignment();
    unsigned PrefAlign =
      TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
    if (Align < PrefAlign) {
      // This store is not sufficiently aligned, so bail out and let this
      // vector store be scalarized.  Note that we may still be able to emit
      // smaller vector stores.  For example, if we are storing a <4 x float>
      // with an alignment of 8, this check will fail but the legalizer will
      // try again with 2 x <2 x float>, which will succeed with an alignment
      // of 8.
      return SDValue();
    }

    unsigned Opcode = 0;
    EVT EltVT = ValVT.getVectorElementType();
    unsigned NumElts = ValVT.getVectorNumElements();

    // Since StoreV2 is a target node, we cannot rely on DAG type
    // legalization.  Therefore, we must ensure the type is legal.  For i1 and
    // i8, we set the stored type to i16 and propagate the "real" type as the
    // memory type.
    bool NeedExt = false;
    if (EltVT.getSizeInBits() < 16)
      NeedExt = true;

    switch (NumElts) {
    default:
      return SDValue();
    case 2:
      Opcode = NVPTXISD::StoreV2;
      break;
    case 4:
      Opcode = NVPTXISD::StoreV4;
      break;
    }

    SmallVector<SDValue, 8> Ops;

    // First is the chain
    Ops.push_back(N->getOperand(0));

    // Then the split values
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                   DAG.getIntPtrConstant(i));
      if (NeedExt)
        ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
      Ops.push_back(ExtVal);
    }

    // Then any remaining arguments
    Ops.append(N->op_begin() + 2, N->op_end());

    SDValue NewSt = DAG.getMemIntrinsicNode(
        Opcode, DL, DAG.getVTList(MVT::Other), Ops,
        MemSD->getMemoryVT(), MemSD->getMemOperand());

    return NewSt;
  }

  return SDValue();
}
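
// E.g. a <4 x i8> store becomes an NVPTXISD::StoreV4 whose lanes are
// any-extended to i16 while the memory VT remains v4i8, so only bytes are
// actually written.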

// st i1 v, addr
//    =>
// v1 = zxt v to i16
// st.u8 i16, addr
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  StoreSDNode *ST = cast<StoreSDNode>(Node);
  SDValue Tmp1 = ST->getChain();
  SDValue Tmp2 = ST->getBasePtr();
  SDValue Tmp3 = ST->getValue();
  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
  unsigned Alignment = ST->getAlignment();
  bool isVolatile = ST->isVolatile();
  bool isNonTemporal = ST->isNonTemporal();
  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
  SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
                                     ST->getPointerInfo(), MVT::i8,
                                     isNonTemporal, isVolatile, Alignment);
  return Result;
}

SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname,
                                        int idx, EVT v) const {
  std::string *name = nvTM->getManagedStrPool()->getManagedString(inname);
  std::stringstream suffix;
  suffix << idx;
  *name += suffix.str();
  return DAG.getTargetExternalSymbol(name->c_str(), v);
}

SDValue
NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
  std::string ParamSym;
  raw_string_ostream ParamStr(ParamSym);

  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
  ParamStr.flush();

  std::string *SavedStr =
    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
}
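
// E.g. for the first parameter of a function named "foo", this produces the
// symbol "foo_param_0".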

SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
  return getExtSymb(DAG, ".HLPPARAM", idx);
}

// Check to see if the kernel argument is image*_t or sampler_t.
bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
  static const char *const specialTypes[] = { "struct._image2d_t",
                                              "struct._image3d_t",
                                              "struct._sampler_t" };

  const Type *Ty = arg->getType();
  const PointerType *PTy = dyn_cast<PointerType>(Ty);

  if (!PTy)
    return false;

  if (!context)
    return false;

  const StructType *STy = dyn_cast<StructType>(PTy->getElementType());
  const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";

  for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i)
    if (TypeName == specialTypes[i])
      return true;

  return false;
}
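
// For example, an OpenCL image2d_t kernel argument reaches this point as a
// pointer to the named struct type "struct._image2d_t", which matches the
// list above.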

SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const DataLayout *TD = getDataLayout();

  const Function *F = MF.getFunction();
  const AttributeSet &PAL = F->getAttributes();
  const TargetLowering *TLI = STI.getTargetLowering();

  SDValue Root = DAG.getRoot();
  std::vector<SDValue> OutChains;

  bool isKernel = llvm::isKernelFunction(*F);
  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  std::vector<Type *> argTypes;
  std::vector<const Argument *> theArgs;
  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
       I != E; ++I) {
    theArgs.push_back(I);
    argTypes.push_back(I->getType());
  }
  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
  // Ins.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Ins),
  //   * if there is a vector argument with more than the typical number of
  //     elements (generally more than 4), where each vector element is
  //     individually present in Ins.
  // So a different index should be used for indexing into Ins.
  // See the similar issue in LowerCall.
  unsigned InsIdx = 0;

  int idx = 0;
  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
    Type *Ty = argTypes[i];

    // If the kernel argument is image*_t or sampler_t, convert it to
    // an i32 constant holding the parameter position. This can later be
    // matched in the AsmPrinter to output the correct mangled name.
    if (isImageOrSamplerVal(
            theArgs[i],
            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                     : nullptr))) {
      assert(isKernel && "Only kernels can have image/sampler params");
      InVals.push_back(DAG.getConstant(i + 1, MVT::i32));
      continue;
    }

    if (theArgs[i]->use_empty()) {
      // Argument is dead.
      if (Ty->isAggregateType()) {
        SmallVector<EVT, 16> vtparts;

        ComputePTXValueVTs(*this, Ty, vtparts);
        assert(vtparts.size() > 0 && "empty aggregate type not expected");
        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (vtparts.size() > 0)
          --InsIdx;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(Ty);
        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (NumRegs > 0)
          --InsIdx;
        continue;
      }
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
      continue;
    }

    // In the following cases, assign a node order of "idx+1" to newly created
    // nodes. The SDNodes for params have to appear in the same order as their
    // order of appearance in the original function. "idx+1" holds that order.
    if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
      if (Ty->isAggregateType()) {
        SmallVector<EVT, 16> vtparts;
        SmallVector<uint64_t, 16> offsets;

        // NOTE: Here, we lose the ability to issue vector loads for vectors
        // that are a part of a struct.  This should be investigated in the
        // future.
        ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0);
        assert(vtparts.size() > 0 && "empty aggregate type not expected");
        bool aggregateIsPacked = false;
        if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
          aggregateIsPacked = STy->isPacked();

        SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          EVT partVT = vtparts[parti];
          Value *srcValue = Constant::getNullValue(
              PointerType::get(partVT.getTypeForEVT(F->getContext()),
                               llvm::ADDRESS_SPACE_PARAM));
          SDValue srcAddr =
              DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
                          DAG.getConstant(offsets[parti], getPointerTy()));
          unsigned partAlign =
              aggregateIsPacked ? 1
                                : TD->getABITypeAlignment(
                                      partVT.getTypeForEVT(F->getContext()));
          SDValue p;
          if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
            ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
                                     ISD::SEXTLOAD : ISD::ZEXTLOAD;
            p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
                               MachinePointerInfo(srcValue), partVT, false,
                               false, false, partAlign);
          } else {
            p = DAG.getLoad(partVT, dl, Root, srcAddr,
                            MachinePointerInfo(srcValue), false, false, false,
                            partAlign);
          }
          if (p.getNode())
            p.getNode()->setIROrder(idx + 1);
          InVals.push_back(p);
          ++InsIdx;
        }
        if (vtparts.size() > 0)
          --InsIdx;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(Ty);
        SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
        unsigned NumElts = ObjectVT.getVectorNumElements();
        assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
               "Vector was not scalarized");
        EVT EltVT = ObjectVT.getVectorElementType();

        // V1 load
        // f32 = load ...
        if (NumElts == 1) {
          // We only have one element, so just directly load it
          Value *SrcValue = Constant::getNullValue(PointerType::get(
              EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
          SDValue P = DAG.getLoad(
              EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false,
              false, true,
              TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
          if (P.getNode())
            P.getNode()->setIROrder(idx + 1);

          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
            P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
          InVals.push_back(P);
          ++InsIdx;
        } else if (NumElts == 2) {
          // V2 load
          // f32,f32 = load ...
          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
          Value *SrcValue = Constant::getNullValue(PointerType::get(
              VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
          SDValue P = DAG.getLoad(
              VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false,
              false, true,
              TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
          if (P.getNode())
            P.getNode()->setIROrder(idx + 1);

          SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
                                     DAG.getIntPtrConstant(0));
          SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
                                     DAG.getIntPtrConstant(1));

          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
            Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
            Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
          }

          InVals.push_back(Elt0);
          InVals.push_back(Elt1);
          InsIdx += 2;
        } else {
          // V4 loads
          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
          // the vector will be expanded to a power of 2 elements, so we know
          // we can always round up to the next multiple of 4 when creating
          // the vector loads.
          // e.g.  4 elem => 1 ld.v4
          //       6 elem => 2 ld.v4
          //       8 elem => 2 ld.v4
          //      11 elem => 3 ld.v4
          unsigned VecSize = 4;
          if (EltVT.getSizeInBits() == 64) {
            VecSize = 2;
          }
          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
          unsigned Ofst = 0;
          for (unsigned i = 0; i < NumElts; i += VecSize) {
            Value *SrcValue = Constant::getNullValue(
                PointerType::get(VecVT.getTypeForEVT(F->getContext()),
                                 llvm::ADDRESS_SPACE_PARAM));
            SDValue SrcAddr =
                DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
                            DAG.getConstant(Ofst, getPointerTy()));
            SDValue P = DAG.getLoad(
                VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
                false, true,
                TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
            if (P.getNode())
              P.getNode()->setIROrder(idx + 1);

            for (unsigned j = 0; j < VecSize; ++j) {
              if (i + j >= NumElts)
                break;
              SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
                                        DAG.getIntPtrConstant(j));
              if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
                Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
              InVals.push_back(Elt);
            }
            Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
          }
          InsIdx += NumElts;
        }

        if (NumElts > 0)
          --InsIdx;
        continue;
      }
   2268       // A plain scalar.
   2269       EVT ObjectVT = getValueType(Ty);
   2270       // If ABI, load from the param symbol
   2271       SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
   2272       Value *srcValue = Constant::getNullValue(PointerType::get(
   2273           ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
      SDValue p;
      if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
        ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
                                     ISD::SEXTLOAD : ISD::ZEXTLOAD;
        p = DAG.getExtLoad(
            ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
            ObjectVT, /*isVolatile=*/false, /*isNonTemporal=*/false,
            /*isInvariant=*/false,
            TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
      } else {
        p = DAG.getLoad(
            Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
            /*isVolatile=*/false, /*isNonTemporal=*/false,
            /*isInvariant=*/false,
            TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
      }
   2287       if (p.getNode())
   2288         p.getNode()->setIROrder(idx + 1);
   2289       InVals.push_back(p);
   2290       continue;
   2291     }
   2292 
    // Param has the ByVal attribute.
    // Return MoveParam(param symbol).
    // Ideally the param symbol could be returned directly, but when the
    // SDNode builder decides to use it in a CopyToReg(), the machine
    // instruction fails because a TargetExternalSymbol (not lowered) is
    // target dependent, and CopyToReg assumes the source is lowered.
   2300     EVT ObjectVT = getValueType(Ty);
   2301     assert(ObjectVT == Ins[InsIdx].VT &&
   2302            "Ins type did not match function type");
   2303     SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
   2304     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
   2305     if (p.getNode())
   2306       p.getNode()->setIROrder(idx + 1);
   2307     if (isKernel)
   2308       InVals.push_back(p);
   2309     else {
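      // For device functions, the byval parameter lives in the local address
      // space, so convert the pointer to the generic space before handing it
      // to the rest of the function.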
   2310       SDValue p2 = DAG.getNode(
   2311           ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
   2312           DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p);
   2313       InVals.push_back(p2);
   2314     }
   2315   }
   2316 
  // Clang checks explicit varargs and issues an error for them, but it lets
  // code with an implicit vararg declaration such as f() pass (see bug
  // 617733). We treat that case as if the argument list were empty, so no
  // vararg handling is needed here.
   2324 
   2325   if (!OutChains.empty())
   2326     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
   2327 
   2328   return Chain;
   2329 }
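// For illustration only: with the lowering above, a kernel parameter that is
// a <2 x float> vector is typically materialized with a single vector load
// from the param space, along the lines of (a sketch, exact PTX varies):
//   ld.param.v2.f32 {%f1, %f2}, [foo_param_0];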
   2330 
   2331 
   2332 SDValue
   2333 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   2334                                  bool isVarArg,
   2335                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
   2336                                  const SmallVectorImpl<SDValue> &OutVals,
   2337                                  SDLoc dl, SelectionDAG &DAG) const {
   2338   MachineFunction &MF = DAG.getMachineFunction();
   2339   const Function *F = MF.getFunction();
   2340   Type *RetTy = F->getReturnType();
   2341   const DataLayout *TD = getDataLayout();
   2342 
   2343   bool isABI = (STI.getSmVersion() >= 20);
   2344   assert(isABI && "Non-ABI compilation is not supported");
   2345   if (!isABI)
   2346     return Chain;
   2347 
   2348   if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
    // If we have a vector type, the OutVals array will be the scalarized
    // components, and we have to combine them into one or more vector stores.
   2351     unsigned NumElts = VTy->getNumElements();
   2352     assert(NumElts == Outs.size() && "Bad scalarization of return value");
   2353 
    EVT EltVT = getValueType(RetTy).getVectorElementType();
    // Elements narrower than 16 bits are zero-extended to i16, the narrowest
    // width these return-value stores operate on.
    bool NeedExtend = EltVT.getSizeInBits() < 16;
   2359 
   2360     // V1 store
   2361     if (NumElts == 1) {
   2362       SDValue StoreVal = OutVals[0];
   2363       // We only have one element, so just directly store it
   2364       if (NeedExtend)
   2365         StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
   2366       SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal };
   2367       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
   2368                                       DAG.getVTList(MVT::Other), Ops,
   2369                                       EltVT, MachinePointerInfo());
   2370 
   2371     } else if (NumElts == 2) {
   2372       // V2 store
   2373       SDValue StoreVal0 = OutVals[0];
   2374       SDValue StoreVal1 = OutVals[1];
   2375 
   2376       if (NeedExtend) {
   2377         StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
   2378         StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
   2379       }
   2380 
   2381       SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0,
   2382                         StoreVal1 };
   2383       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
   2384                                       DAG.getVTList(MVT::Other), Ops,
   2385                                       EltVT, MachinePointerInfo());
   2386     } else {
   2387       // V4 stores
   2388       // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
   2389       // vector will be expanded to a power of 2 elements, so we know we can
   2390       // always round up to the next multiple of 4 when creating the vector
   2391       // stores.
   2392       // e.g.  4 elem => 1 st.v4
   2393       //       6 elem => 2 st.v4
   2394       //       8 elem => 2 st.v4
   2395       //      11 elem => 3 st.v4
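      // A sketch of the resulting PTX, assuming f32 elements and 8 values:
      //   st.param.v4.f32 [func_retval0+0],  {%f1, %f2, %f3, %f4};
      //   st.param.v4.f32 [func_retval0+16], {%f5, %f6, %f7, %f8};
      // Lanes past NumElts are filled with undef values.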
   2396 
   2397       unsigned VecSize = 4;
   2398       if (OutVals[0].getValueType().getSizeInBits() == 64)
   2399         VecSize = 2;
   2400 
   2401       unsigned Offset = 0;
   2402 
   2403       EVT VecVT =
   2404           EVT::getVectorVT(F->getContext(), EltVT, VecSize);
   2405       unsigned PerStoreOffset =
   2406           TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
   2407 
   2408       for (unsigned i = 0; i < NumElts; i += VecSize) {
   2409         // Get values
   2410         SDValue StoreVal;
   2411         SmallVector<SDValue, 8> Ops;
   2412         Ops.push_back(Chain);
   2413         Ops.push_back(DAG.getConstant(Offset, MVT::i32));
   2414         unsigned Opc = NVPTXISD::StoreRetvalV2;
   2415         EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
   2416 
   2417         StoreVal = OutVals[i];
   2418         if (NeedExtend)
   2419           StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2420         Ops.push_back(StoreVal);
   2421 
   2422         if (i + 1 < NumElts) {
   2423           StoreVal = OutVals[i + 1];
   2424           if (NeedExtend)
   2425             StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2426         } else {
   2427           StoreVal = DAG.getUNDEF(ExtendedVT);
   2428         }
   2429         Ops.push_back(StoreVal);
   2430 
   2431         if (VecSize == 4) {
   2432           Opc = NVPTXISD::StoreRetvalV4;
   2433           if (i + 2 < NumElts) {
   2434             StoreVal = OutVals[i + 2];
   2435             if (NeedExtend)
   2436               StoreVal =
   2437                   DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2438           } else {
   2439             StoreVal = DAG.getUNDEF(ExtendedVT);
   2440           }
   2441           Ops.push_back(StoreVal);
   2442 
   2443           if (i + 3 < NumElts) {
   2444             StoreVal = OutVals[i + 3];
   2445             if (NeedExtend)
   2446               StoreVal =
   2447                   DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2448           } else {
   2449             StoreVal = DAG.getUNDEF(ExtendedVT);
   2450           }
   2451           Ops.push_back(StoreVal);
   2452         }
   2453 
   2455         Chain =
   2456             DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
   2457                                     EltVT, MachinePointerInfo());
   2458         Offset += PerStoreOffset;
   2459       }
   2460     }
   2461   } else {
   2462     SmallVector<EVT, 16> ValVTs;
   2463     SmallVector<uint64_t, 16> Offsets;
   2464     ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
   2465     assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
   2466 
   2467     for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
   2468       SDValue theVal = OutVals[i];
   2469       EVT TheValType = theVal.getValueType();
   2470       unsigned numElems = 1;
   2471       if (TheValType.isVector())
   2472         numElems = TheValType.getVectorNumElements();
   2473       for (unsigned j = 0, je = numElems; j != je; ++j) {
   2474         SDValue TmpVal = theVal;
   2475         if (TheValType.isVector())
   2476           TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   2477                                TheValType.getVectorElementType(), TmpVal,
   2478                                DAG.getIntPtrConstant(j));
   2479         EVT TheStoreType = ValVTs[i];
   2480         if (RetTy->isIntegerTy() &&
   2481             TD->getTypeAllocSizeInBits(RetTy) < 32) {
   2482           // The following zero-extension is for integer types only, and
   2483           // specifically not for aggregates.
   2484           TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
   2485           TheStoreType = MVT::i32;
   2486         }
   2487         else if (TmpVal.getValueType().getSizeInBits() < 16)
   2488           TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
   2489 
   2490         SDValue Ops[] = {
   2491           Chain,
   2492           DAG.getConstant(Offsets[i], MVT::i32),
   2493           TmpVal };
   2494         Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
   2495                                         DAG.getVTList(MVT::Other), Ops,
   2496                                         TheStoreType,
   2497                                         MachinePointerInfo());
   2498       }
   2499     }
   2500   }
   2501 
   2502   return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
   2503 }
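// As a rough sketch: for a function returning a plain i32, the StoreRetval
// node built above becomes a parameter store and RET_FLAG becomes the return:
//   st.param.b32 [func_retval0+0], %r1;
//   ret;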
   2504 
   2505 
   2506 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
   2507     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
   2508     SelectionDAG &DAG) const {
  if (Constraint.length() > 1)
    return;
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   2513 }
   2514 
// NVPTX supports vectors of legal types of any length in intrinsics, because
// the NVPTX-specific type legalizer will legalize them to a PTX-supported
// length.
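// For example, a v8f32 operand is not itself a legal PTX type, but its f32
// element type is, so an intrinsic taking it is accepted here and split up
// later by the type legalizer.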
   2518 bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
   2519   if (isTypeLegal(VT))
   2520     return true;
   2521   if (VT.isVector()) {
   2522     MVT eVT = VT.getVectorElementType();
   2523     if (isTypeLegal(eVT))
   2524       return true;
   2525   }
   2526   return false;
   2527 }
   2528 
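// Map texture intrinsics to the corresponding NVPTXISD opcode. The intrinsic
// name encodes the geometry, the result element type, and the coordinate
// type; e.g. nvvm_tex_1d_v4f32_s32 (1D, <4 x float> result, i32 coordinate)
// maps to Tex1DFloatS32. A return value of 0 means "not a texture intrinsic".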
   2529 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
   2530   switch (Intrinsic) {
   2531   default:
   2532     return 0;
   2533 
   2534   case Intrinsic::nvvm_tex_1d_v4f32_s32:
   2535     return NVPTXISD::Tex1DFloatS32;
   2536   case Intrinsic::nvvm_tex_1d_v4f32_f32:
   2537     return NVPTXISD::Tex1DFloatFloat;
   2538   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
   2539     return NVPTXISD::Tex1DFloatFloatLevel;
   2540   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
   2541     return NVPTXISD::Tex1DFloatFloatGrad;
   2542   case Intrinsic::nvvm_tex_1d_v4s32_s32:
   2543     return NVPTXISD::Tex1DS32S32;
   2544   case Intrinsic::nvvm_tex_1d_v4s32_f32:
   2545     return NVPTXISD::Tex1DS32Float;
   2546   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
   2547     return NVPTXISD::Tex1DS32FloatLevel;
   2548   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
   2549     return NVPTXISD::Tex1DS32FloatGrad;
   2550   case Intrinsic::nvvm_tex_1d_v4u32_s32:
   2551     return NVPTXISD::Tex1DU32S32;
   2552   case Intrinsic::nvvm_tex_1d_v4u32_f32:
   2553     return NVPTXISD::Tex1DU32Float;
   2554   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
   2555     return NVPTXISD::Tex1DU32FloatLevel;
   2556   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
   2557     return NVPTXISD::Tex1DU32FloatGrad;
   2558 
   2559   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
   2560     return NVPTXISD::Tex1DArrayFloatS32;
   2561   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
   2562     return NVPTXISD::Tex1DArrayFloatFloat;
   2563   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
   2564     return NVPTXISD::Tex1DArrayFloatFloatLevel;
   2565   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
   2566     return NVPTXISD::Tex1DArrayFloatFloatGrad;
   2567   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
   2568     return NVPTXISD::Tex1DArrayS32S32;
   2569   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
   2570     return NVPTXISD::Tex1DArrayS32Float;
   2571   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
   2572     return NVPTXISD::Tex1DArrayS32FloatLevel;
   2573   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
   2574     return NVPTXISD::Tex1DArrayS32FloatGrad;
   2575   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
   2576     return NVPTXISD::Tex1DArrayU32S32;
   2577   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
   2578     return NVPTXISD::Tex1DArrayU32Float;
   2579   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
   2580     return NVPTXISD::Tex1DArrayU32FloatLevel;
   2581   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
   2582     return NVPTXISD::Tex1DArrayU32FloatGrad;
   2583 
   2584   case Intrinsic::nvvm_tex_2d_v4f32_s32:
   2585     return NVPTXISD::Tex2DFloatS32;
   2586   case Intrinsic::nvvm_tex_2d_v4f32_f32:
   2587     return NVPTXISD::Tex2DFloatFloat;
   2588   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
   2589     return NVPTXISD::Tex2DFloatFloatLevel;
   2590   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
   2591     return NVPTXISD::Tex2DFloatFloatGrad;
   2592   case Intrinsic::nvvm_tex_2d_v4s32_s32:
   2593     return NVPTXISD::Tex2DS32S32;
   2594   case Intrinsic::nvvm_tex_2d_v4s32_f32:
   2595     return NVPTXISD::Tex2DS32Float;
   2596   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
   2597     return NVPTXISD::Tex2DS32FloatLevel;
   2598   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
   2599     return NVPTXISD::Tex2DS32FloatGrad;
   2600   case Intrinsic::nvvm_tex_2d_v4u32_s32:
   2601     return NVPTXISD::Tex2DU32S32;
   2602   case Intrinsic::nvvm_tex_2d_v4u32_f32:
   2603     return NVPTXISD::Tex2DU32Float;
   2604   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
   2605     return NVPTXISD::Tex2DU32FloatLevel;
   2606   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
   2607     return NVPTXISD::Tex2DU32FloatGrad;
   2608 
   2609   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
   2610     return NVPTXISD::Tex2DArrayFloatS32;
   2611   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
   2612     return NVPTXISD::Tex2DArrayFloatFloat;
   2613   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
   2614     return NVPTXISD::Tex2DArrayFloatFloatLevel;
   2615   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
   2616     return NVPTXISD::Tex2DArrayFloatFloatGrad;
   2617   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
   2618     return NVPTXISD::Tex2DArrayS32S32;
   2619   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
   2620     return NVPTXISD::Tex2DArrayS32Float;
   2621   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
   2622     return NVPTXISD::Tex2DArrayS32FloatLevel;
   2623   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
   2624     return NVPTXISD::Tex2DArrayS32FloatGrad;
   2625   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
   2626     return NVPTXISD::Tex2DArrayU32S32;
   2627   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
   2628     return NVPTXISD::Tex2DArrayU32Float;
   2629   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
   2630     return NVPTXISD::Tex2DArrayU32FloatLevel;
   2631   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
   2632     return NVPTXISD::Tex2DArrayU32FloatGrad;
   2633 
   2634   case Intrinsic::nvvm_tex_3d_v4f32_s32:
   2635     return NVPTXISD::Tex3DFloatS32;
   2636   case Intrinsic::nvvm_tex_3d_v4f32_f32:
   2637     return NVPTXISD::Tex3DFloatFloat;
   2638   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
   2639     return NVPTXISD::Tex3DFloatFloatLevel;
   2640   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
   2641     return NVPTXISD::Tex3DFloatFloatGrad;
   2642   case Intrinsic::nvvm_tex_3d_v4s32_s32:
   2643     return NVPTXISD::Tex3DS32S32;
   2644   case Intrinsic::nvvm_tex_3d_v4s32_f32:
   2645     return NVPTXISD::Tex3DS32Float;
   2646   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
   2647     return NVPTXISD::Tex3DS32FloatLevel;
   2648   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
   2649     return NVPTXISD::Tex3DS32FloatGrad;
   2650   case Intrinsic::nvvm_tex_3d_v4u32_s32:
   2651     return NVPTXISD::Tex3DU32S32;
   2652   case Intrinsic::nvvm_tex_3d_v4u32_f32:
   2653     return NVPTXISD::Tex3DU32Float;
   2654   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
   2655     return NVPTXISD::Tex3DU32FloatLevel;
   2656   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
   2657     return NVPTXISD::Tex3DU32FloatGrad;
   2658 
   2659   case Intrinsic::nvvm_tex_cube_v4f32_f32:
   2660     return NVPTXISD::TexCubeFloatFloat;
   2661   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
   2662     return NVPTXISD::TexCubeFloatFloatLevel;
   2663   case Intrinsic::nvvm_tex_cube_v4s32_f32:
   2664     return NVPTXISD::TexCubeS32Float;
   2665   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
   2666     return NVPTXISD::TexCubeS32FloatLevel;
   2667   case Intrinsic::nvvm_tex_cube_v4u32_f32:
   2668     return NVPTXISD::TexCubeU32Float;
   2669   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
   2670     return NVPTXISD::TexCubeU32FloatLevel;
   2671 
   2672   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
   2673     return NVPTXISD::TexCubeArrayFloatFloat;
   2674   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
   2675     return NVPTXISD::TexCubeArrayFloatFloatLevel;
   2676   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
   2677     return NVPTXISD::TexCubeArrayS32Float;
   2678   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
   2679     return NVPTXISD::TexCubeArrayS32FloatLevel;
   2680   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
   2681     return NVPTXISD::TexCubeArrayU32Float;
   2682   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
   2683     return NVPTXISD::TexCubeArrayU32FloatLevel;
   2684 
   2685   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
   2686     return NVPTXISD::Tld4R2DFloatFloat;
   2687   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
   2688     return NVPTXISD::Tld4G2DFloatFloat;
   2689   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
   2690     return NVPTXISD::Tld4B2DFloatFloat;
   2691   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
   2692     return NVPTXISD::Tld4A2DFloatFloat;
   2693   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
   2694     return NVPTXISD::Tld4R2DS64Float;
   2695   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
   2696     return NVPTXISD::Tld4G2DS64Float;
   2697   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
   2698     return NVPTXISD::Tld4B2DS64Float;
   2699   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
   2700     return NVPTXISD::Tld4A2DS64Float;
   2701   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
   2702     return NVPTXISD::Tld4R2DU64Float;
   2703   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
   2704     return NVPTXISD::Tld4G2DU64Float;
   2705   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
   2706     return NVPTXISD::Tld4B2DU64Float;
   2707   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
   2708     return NVPTXISD::Tld4A2DU64Float;
   2709 
   2710   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
   2711     return NVPTXISD::TexUnified1DFloatS32;
   2712   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
   2713     return NVPTXISD::TexUnified1DFloatFloat;
   2714   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
   2715     return NVPTXISD::TexUnified1DFloatFloatLevel;
   2716   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
   2717     return NVPTXISD::TexUnified1DFloatFloatGrad;
   2718   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
   2719     return NVPTXISD::TexUnified1DS32S32;
   2720   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
   2721     return NVPTXISD::TexUnified1DS32Float;
   2722   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
   2723     return NVPTXISD::TexUnified1DS32FloatLevel;
   2724   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
   2725     return NVPTXISD::TexUnified1DS32FloatGrad;
   2726   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
   2727     return NVPTXISD::TexUnified1DU32S32;
   2728   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
   2729     return NVPTXISD::TexUnified1DU32Float;
   2730   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
   2731     return NVPTXISD::TexUnified1DU32FloatLevel;
   2732   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
   2733     return NVPTXISD::TexUnified1DU32FloatGrad;
   2734 
   2735   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
   2736     return NVPTXISD::TexUnified1DArrayFloatS32;
   2737   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
   2738     return NVPTXISD::TexUnified1DArrayFloatFloat;
   2739   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
   2740     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
   2741   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
   2742     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
   2743   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
   2744     return NVPTXISD::TexUnified1DArrayS32S32;
   2745   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
   2746     return NVPTXISD::TexUnified1DArrayS32Float;
   2747   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
   2748     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
   2749   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
   2750     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
   2751   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
   2752     return NVPTXISD::TexUnified1DArrayU32S32;
   2753   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
   2754     return NVPTXISD::TexUnified1DArrayU32Float;
   2755   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
   2756     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
   2757   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
   2758     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
   2759 
   2760   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
   2761     return NVPTXISD::TexUnified2DFloatS32;
   2762   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
   2763     return NVPTXISD::TexUnified2DFloatFloat;
   2764   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
   2765     return NVPTXISD::TexUnified2DFloatFloatLevel;
   2766   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
   2767     return NVPTXISD::TexUnified2DFloatFloatGrad;
   2768   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
   2769     return NVPTXISD::TexUnified2DS32S32;
   2770   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
   2771     return NVPTXISD::TexUnified2DS32Float;
   2772   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
   2773     return NVPTXISD::TexUnified2DS32FloatLevel;
   2774   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
   2775     return NVPTXISD::TexUnified2DS32FloatGrad;
   2776   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
   2777     return NVPTXISD::TexUnified2DU32S32;
   2778   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
   2779     return NVPTXISD::TexUnified2DU32Float;
   2780   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
   2781     return NVPTXISD::TexUnified2DU32FloatLevel;
   2782   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
   2783     return NVPTXISD::TexUnified2DU32FloatGrad;
   2784 
   2785   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
   2786     return NVPTXISD::TexUnified2DArrayFloatS32;
   2787   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
   2788     return NVPTXISD::TexUnified2DArrayFloatFloat;
   2789   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
   2790     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
   2791   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
   2792     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
   2793   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
   2794     return NVPTXISD::TexUnified2DArrayS32S32;
   2795   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
   2796     return NVPTXISD::TexUnified2DArrayS32Float;
   2797   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
   2798     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
   2799   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
   2800     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
   2801   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
   2802     return NVPTXISD::TexUnified2DArrayU32S32;
   2803   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
   2804     return NVPTXISD::TexUnified2DArrayU32Float;
   2805   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
   2806     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
   2807   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
   2808     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
   2809 
   2810   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
   2811     return NVPTXISD::TexUnified3DFloatS32;
   2812   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
   2813     return NVPTXISD::TexUnified3DFloatFloat;
   2814   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
   2815     return NVPTXISD::TexUnified3DFloatFloatLevel;
   2816   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
   2817     return NVPTXISD::TexUnified3DFloatFloatGrad;
   2818   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
   2819     return NVPTXISD::TexUnified3DS32S32;
   2820   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
   2821     return NVPTXISD::TexUnified3DS32Float;
   2822   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
   2823     return NVPTXISD::TexUnified3DS32FloatLevel;
   2824   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
   2825     return NVPTXISD::TexUnified3DS32FloatGrad;
   2826   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
   2827     return NVPTXISD::TexUnified3DU32S32;
   2828   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
   2829     return NVPTXISD::TexUnified3DU32Float;
   2830   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
   2831     return NVPTXISD::TexUnified3DU32FloatLevel;
   2832   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
   2833     return NVPTXISD::TexUnified3DU32FloatGrad;
   2834 
   2835   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
   2836     return NVPTXISD::TexUnifiedCubeFloatFloat;
   2837   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
   2838     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
   2839   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
   2840     return NVPTXISD::TexUnifiedCubeS32Float;
   2841   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
   2842     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
   2843   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
   2844     return NVPTXISD::TexUnifiedCubeU32Float;
   2845   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
   2846     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
   2847 
   2848   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
   2849     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
   2850   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
   2851     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
   2852   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
   2853     return NVPTXISD::TexUnifiedCubeArrayS32Float;
   2854   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
   2855     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
   2856   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
   2857     return NVPTXISD::TexUnifiedCubeArrayU32Float;
   2858   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
   2859     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
   2860 
   2861   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
   2862     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
   2863   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
   2864     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
   2865   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
   2866     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
   2867   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
   2868     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
   2869   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
   2870     return NVPTXISD::Tld4UnifiedR2DS64Float;
   2871   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
   2872     return NVPTXISD::Tld4UnifiedG2DS64Float;
   2873   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
   2874     return NVPTXISD::Tld4UnifiedB2DS64Float;
   2875   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
   2876     return NVPTXISD::Tld4UnifiedA2DS64Float;
   2877   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
   2878     return NVPTXISD::Tld4UnifiedR2DU64Float;
   2879   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
   2880     return NVPTXISD::Tld4UnifiedG2DU64Float;
   2881   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
   2882     return NVPTXISD::Tld4UnifiedB2DU64Float;
   2883   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
   2884     return NVPTXISD::Tld4UnifiedA2DU64Float;
   2885   }
   2886 }
   2887 
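// Map surface-load intrinsics to the corresponding NVPTXISD opcode. The
// suffix selects the out-of-bounds behavior: _clamp, _trap, or _zero.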
   2888 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
   2889   switch (Intrinsic) {
   2890   default:
   2891     return 0;
   2892   case Intrinsic::nvvm_suld_1d_i8_clamp:
   2893     return NVPTXISD::Suld1DI8Clamp;
   2894   case Intrinsic::nvvm_suld_1d_i16_clamp:
   2895     return NVPTXISD::Suld1DI16Clamp;
   2896   case Intrinsic::nvvm_suld_1d_i32_clamp:
   2897     return NVPTXISD::Suld1DI32Clamp;
   2898   case Intrinsic::nvvm_suld_1d_i64_clamp:
   2899     return NVPTXISD::Suld1DI64Clamp;
   2900   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
   2901     return NVPTXISD::Suld1DV2I8Clamp;
   2902   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
   2903     return NVPTXISD::Suld1DV2I16Clamp;
   2904   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
   2905     return NVPTXISD::Suld1DV2I32Clamp;
   2906   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
   2907     return NVPTXISD::Suld1DV2I64Clamp;
   2908   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
   2909     return NVPTXISD::Suld1DV4I8Clamp;
   2910   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
   2911     return NVPTXISD::Suld1DV4I16Clamp;
   2912   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
   2913     return NVPTXISD::Suld1DV4I32Clamp;
   2914   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
   2915     return NVPTXISD::Suld1DArrayI8Clamp;
   2916   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
   2917     return NVPTXISD::Suld1DArrayI16Clamp;
   2918   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
   2919     return NVPTXISD::Suld1DArrayI32Clamp;
   2920   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
   2921     return NVPTXISD::Suld1DArrayI64Clamp;
   2922   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
   2923     return NVPTXISD::Suld1DArrayV2I8Clamp;
   2924   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
   2925     return NVPTXISD::Suld1DArrayV2I16Clamp;
   2926   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
   2927     return NVPTXISD::Suld1DArrayV2I32Clamp;
   2928   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
   2929     return NVPTXISD::Suld1DArrayV2I64Clamp;
   2930   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
   2931     return NVPTXISD::Suld1DArrayV4I8Clamp;
   2932   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
   2933     return NVPTXISD::Suld1DArrayV4I16Clamp;
   2934   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
   2935     return NVPTXISD::Suld1DArrayV4I32Clamp;
   2936   case Intrinsic::nvvm_suld_2d_i8_clamp:
   2937     return NVPTXISD::Suld2DI8Clamp;
   2938   case Intrinsic::nvvm_suld_2d_i16_clamp:
   2939     return NVPTXISD::Suld2DI16Clamp;
   2940   case Intrinsic::nvvm_suld_2d_i32_clamp:
   2941     return NVPTXISD::Suld2DI32Clamp;
   2942   case Intrinsic::nvvm_suld_2d_i64_clamp:
   2943     return NVPTXISD::Suld2DI64Clamp;
   2944   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
   2945     return NVPTXISD::Suld2DV2I8Clamp;
   2946   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
   2947     return NVPTXISD::Suld2DV2I16Clamp;
   2948   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
   2949     return NVPTXISD::Suld2DV2I32Clamp;
   2950   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
   2951     return NVPTXISD::Suld2DV2I64Clamp;
   2952   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
   2953     return NVPTXISD::Suld2DV4I8Clamp;
   2954   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
   2955     return NVPTXISD::Suld2DV4I16Clamp;
   2956   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
   2957     return NVPTXISD::Suld2DV4I32Clamp;
   2958   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
   2959     return NVPTXISD::Suld2DArrayI8Clamp;
   2960   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
   2961     return NVPTXISD::Suld2DArrayI16Clamp;
   2962   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
   2963     return NVPTXISD::Suld2DArrayI32Clamp;
   2964   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
   2965     return NVPTXISD::Suld2DArrayI64Clamp;
   2966   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
   2967     return NVPTXISD::Suld2DArrayV2I8Clamp;
   2968   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
   2969     return NVPTXISD::Suld2DArrayV2I16Clamp;
   2970   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
   2971     return NVPTXISD::Suld2DArrayV2I32Clamp;
   2972   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
   2973     return NVPTXISD::Suld2DArrayV2I64Clamp;
   2974   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
   2975     return NVPTXISD::Suld2DArrayV4I8Clamp;
   2976   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
   2977     return NVPTXISD::Suld2DArrayV4I16Clamp;
   2978   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
   2979     return NVPTXISD::Suld2DArrayV4I32Clamp;
   2980   case Intrinsic::nvvm_suld_3d_i8_clamp:
   2981     return NVPTXISD::Suld3DI8Clamp;
   2982   case Intrinsic::nvvm_suld_3d_i16_clamp:
   2983     return NVPTXISD::Suld3DI16Clamp;
   2984   case Intrinsic::nvvm_suld_3d_i32_clamp:
   2985     return NVPTXISD::Suld3DI32Clamp;
   2986   case Intrinsic::nvvm_suld_3d_i64_clamp:
   2987     return NVPTXISD::Suld3DI64Clamp;
   2988   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
   2989     return NVPTXISD::Suld3DV2I8Clamp;
   2990   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
   2991     return NVPTXISD::Suld3DV2I16Clamp;
   2992   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
   2993     return NVPTXISD::Suld3DV2I32Clamp;
   2994   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
   2995     return NVPTXISD::Suld3DV2I64Clamp;
   2996   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
   2997     return NVPTXISD::Suld3DV4I8Clamp;
   2998   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
   2999     return NVPTXISD::Suld3DV4I16Clamp;
   3000   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
   3001     return NVPTXISD::Suld3DV4I32Clamp;
   3002   case Intrinsic::nvvm_suld_1d_i8_trap:
   3003     return NVPTXISD::Suld1DI8Trap;
   3004   case Intrinsic::nvvm_suld_1d_i16_trap:
   3005     return NVPTXISD::Suld1DI16Trap;
   3006   case Intrinsic::nvvm_suld_1d_i32_trap:
   3007     return NVPTXISD::Suld1DI32Trap;
   3008   case Intrinsic::nvvm_suld_1d_i64_trap:
   3009     return NVPTXISD::Suld1DI64Trap;
   3010   case Intrinsic::nvvm_suld_1d_v2i8_trap:
   3011     return NVPTXISD::Suld1DV2I8Trap;
   3012   case Intrinsic::nvvm_suld_1d_v2i16_trap:
   3013     return NVPTXISD::Suld1DV2I16Trap;
   3014   case Intrinsic::nvvm_suld_1d_v2i32_trap:
   3015     return NVPTXISD::Suld1DV2I32Trap;
   3016   case Intrinsic::nvvm_suld_1d_v2i64_trap:
   3017     return NVPTXISD::Suld1DV2I64Trap;
   3018   case Intrinsic::nvvm_suld_1d_v4i8_trap:
   3019     return NVPTXISD::Suld1DV4I8Trap;
   3020   case Intrinsic::nvvm_suld_1d_v4i16_trap:
   3021     return NVPTXISD::Suld1DV4I16Trap;
   3022   case Intrinsic::nvvm_suld_1d_v4i32_trap:
   3023     return NVPTXISD::Suld1DV4I32Trap;
   3024   case Intrinsic::nvvm_suld_1d_array_i8_trap:
   3025     return NVPTXISD::Suld1DArrayI8Trap;
   3026   case Intrinsic::nvvm_suld_1d_array_i16_trap:
   3027     return NVPTXISD::Suld1DArrayI16Trap;
   3028   case Intrinsic::nvvm_suld_1d_array_i32_trap:
   3029     return NVPTXISD::Suld1DArrayI32Trap;
   3030   case Intrinsic::nvvm_suld_1d_array_i64_trap:
   3031     return NVPTXISD::Suld1DArrayI64Trap;
   3032   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
   3033     return NVPTXISD::Suld1DArrayV2I8Trap;
   3034   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
   3035     return NVPTXISD::Suld1DArrayV2I16Trap;
   3036   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
   3037     return NVPTXISD::Suld1DArrayV2I32Trap;
   3038   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
   3039     return NVPTXISD::Suld1DArrayV2I64Trap;
   3040   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
   3041     return NVPTXISD::Suld1DArrayV4I8Trap;
   3042   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
   3043     return NVPTXISD::Suld1DArrayV4I16Trap;
   3044   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
   3045     return NVPTXISD::Suld1DArrayV4I32Trap;
   3046   case Intrinsic::nvvm_suld_2d_i8_trap:
   3047     return NVPTXISD::Suld2DI8Trap;
   3048   case Intrinsic::nvvm_suld_2d_i16_trap:
   3049     return NVPTXISD::Suld2DI16Trap;
   3050   case Intrinsic::nvvm_suld_2d_i32_trap:
   3051     return NVPTXISD::Suld2DI32Trap;
   3052   case Intrinsic::nvvm_suld_2d_i64_trap:
   3053     return NVPTXISD::Suld2DI64Trap;
   3054   case Intrinsic::nvvm_suld_2d_v2i8_trap:
   3055     return NVPTXISD::Suld2DV2I8Trap;
   3056   case Intrinsic::nvvm_suld_2d_v2i16_trap:
   3057     return NVPTXISD::Suld2DV2I16Trap;
   3058   case Intrinsic::nvvm_suld_2d_v2i32_trap:
   3059     return NVPTXISD::Suld2DV2I32Trap;
   3060   case Intrinsic::nvvm_suld_2d_v2i64_trap:
   3061     return NVPTXISD::Suld2DV2I64Trap;
   3062   case Intrinsic::nvvm_suld_2d_v4i8_trap:
   3063     return NVPTXISD::Suld2DV4I8Trap;
   3064   case Intrinsic::nvvm_suld_2d_v4i16_trap:
   3065     return NVPTXISD::Suld2DV4I16Trap;
   3066   case Intrinsic::nvvm_suld_2d_v4i32_trap:
   3067     return NVPTXISD::Suld2DV4I32Trap;
   3068   case Intrinsic::nvvm_suld_2d_array_i8_trap:
   3069     return NVPTXISD::Suld2DArrayI8Trap;
   3070   case Intrinsic::nvvm_suld_2d_array_i16_trap:
   3071     return NVPTXISD::Suld2DArrayI16Trap;
   3072   case Intrinsic::nvvm_suld_2d_array_i32_trap:
   3073     return NVPTXISD::Suld2DArrayI32Trap;
   3074   case Intrinsic::nvvm_suld_2d_array_i64_trap:
   3075     return NVPTXISD::Suld2DArrayI64Trap;
   3076   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
   3077     return NVPTXISD::Suld2DArrayV2I8Trap;
   3078   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
   3079     return NVPTXISD::Suld2DArrayV2I16Trap;
   3080   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
   3081     return NVPTXISD::Suld2DArrayV2I32Trap;
   3082   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
   3083     return NVPTXISD::Suld2DArrayV2I64Trap;
   3084   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
   3085     return NVPTXISD::Suld2DArrayV4I8Trap;
   3086   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
   3087     return NVPTXISD::Suld2DArrayV4I16Trap;
   3088   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
   3089     return NVPTXISD::Suld2DArrayV4I32Trap;
   3090   case Intrinsic::nvvm_suld_3d_i8_trap:
   3091     return NVPTXISD::Suld3DI8Trap;
   3092   case Intrinsic::nvvm_suld_3d_i16_trap:
   3093     return NVPTXISD::Suld3DI16Trap;
   3094   case Intrinsic::nvvm_suld_3d_i32_trap:
   3095     return NVPTXISD::Suld3DI32Trap;
   3096   case Intrinsic::nvvm_suld_3d_i64_trap:
   3097     return NVPTXISD::Suld3DI64Trap;
   3098   case Intrinsic::nvvm_suld_3d_v2i8_trap:
   3099     return NVPTXISD::Suld3DV2I8Trap;
   3100   case Intrinsic::nvvm_suld_3d_v2i16_trap:
   3101     return NVPTXISD::Suld3DV2I16Trap;
   3102   case Intrinsic::nvvm_suld_3d_v2i32_trap:
   3103     return NVPTXISD::Suld3DV2I32Trap;
   3104   case Intrinsic::nvvm_suld_3d_v2i64_trap:
   3105     return NVPTXISD::Suld3DV2I64Trap;
   3106   case Intrinsic::nvvm_suld_3d_v4i8_trap:
   3107     return NVPTXISD::Suld3DV4I8Trap;
   3108   case Intrinsic::nvvm_suld_3d_v4i16_trap:
   3109     return NVPTXISD::Suld3DV4I16Trap;
   3110   case Intrinsic::nvvm_suld_3d_v4i32_trap:
   3111     return NVPTXISD::Suld3DV4I32Trap;
   3112   case Intrinsic::nvvm_suld_1d_i8_zero:
   3113     return NVPTXISD::Suld1DI8Zero;
   3114   case Intrinsic::nvvm_suld_1d_i16_zero:
   3115     return NVPTXISD::Suld1DI16Zero;
   3116   case Intrinsic::nvvm_suld_1d_i32_zero:
   3117     return NVPTXISD::Suld1DI32Zero;
   3118   case Intrinsic::nvvm_suld_1d_i64_zero:
   3119     return NVPTXISD::Suld1DI64Zero;
   3120   case Intrinsic::nvvm_suld_1d_v2i8_zero:
   3121     return NVPTXISD::Suld1DV2I8Zero;
   3122   case Intrinsic::nvvm_suld_1d_v2i16_zero:
   3123     return NVPTXISD::Suld1DV2I16Zero;
   3124   case Intrinsic::nvvm_suld_1d_v2i32_zero:
   3125     return NVPTXISD::Suld1DV2I32Zero;
   3126   case Intrinsic::nvvm_suld_1d_v2i64_zero:
   3127     return NVPTXISD::Suld1DV2I64Zero;
   3128   case Intrinsic::nvvm_suld_1d_v4i8_zero:
   3129     return NVPTXISD::Suld1DV4I8Zero;
   3130   case Intrinsic::nvvm_suld_1d_v4i16_zero:
   3131     return NVPTXISD::Suld1DV4I16Zero;
   3132   case Intrinsic::nvvm_suld_1d_v4i32_zero:
   3133     return NVPTXISD::Suld1DV4I32Zero;
   3134   case Intrinsic::nvvm_suld_1d_array_i8_zero:
   3135     return NVPTXISD::Suld1DArrayI8Zero;
   3136   case Intrinsic::nvvm_suld_1d_array_i16_zero:
   3137     return NVPTXISD::Suld1DArrayI16Zero;
   3138   case Intrinsic::nvvm_suld_1d_array_i32_zero:
   3139     return NVPTXISD::Suld1DArrayI32Zero;
   3140   case Intrinsic::nvvm_suld_1d_array_i64_zero:
   3141     return NVPTXISD::Suld1DArrayI64Zero;
   3142   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
   3143     return NVPTXISD::Suld1DArrayV2I8Zero;
   3144   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
   3145     return NVPTXISD::Suld1DArrayV2I16Zero;
   3146   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
   3147     return NVPTXISD::Suld1DArrayV2I32Zero;
   3148   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
   3149     return NVPTXISD::Suld1DArrayV2I64Zero;
   3150   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
   3151     return NVPTXISD::Suld1DArrayV4I8Zero;
   3152   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
   3153     return NVPTXISD::Suld1DArrayV4I16Zero;
   3154   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
   3155     return NVPTXISD::Suld1DArrayV4I32Zero;
   3156   case Intrinsic::nvvm_suld_2d_i8_zero:
   3157     return NVPTXISD::Suld2DI8Zero;
   3158   case Intrinsic::nvvm_suld_2d_i16_zero:
   3159     return NVPTXISD::Suld2DI16Zero;
   3160   case Intrinsic::nvvm_suld_2d_i32_zero:
   3161     return NVPTXISD::Suld2DI32Zero;
   3162   case Intrinsic::nvvm_suld_2d_i64_zero:
   3163     return NVPTXISD::Suld2DI64Zero;
   3164   case Intrinsic::nvvm_suld_2d_v2i8_zero:
   3165     return NVPTXISD::Suld2DV2I8Zero;
   3166   case Intrinsic::nvvm_suld_2d_v2i16_zero:
   3167     return NVPTXISD::Suld2DV2I16Zero;
   3168   case Intrinsic::nvvm_suld_2d_v2i32_zero:
   3169     return NVPTXISD::Suld2DV2I32Zero;
   3170   case Intrinsic::nvvm_suld_2d_v2i64_zero:
   3171     return NVPTXISD::Suld2DV2I64Zero;
   3172   case Intrinsic::nvvm_suld_2d_v4i8_zero:
   3173     return NVPTXISD::Suld2DV4I8Zero;
   3174   case Intrinsic::nvvm_suld_2d_v4i16_zero:
   3175     return NVPTXISD::Suld2DV4I16Zero;
   3176   case Intrinsic::nvvm_suld_2d_v4i32_zero:
   3177     return NVPTXISD::Suld2DV4I32Zero;
   3178   case Intrinsic::nvvm_suld_2d_array_i8_zero:
   3179     return NVPTXISD::Suld2DArrayI8Zero;
   3180   case Intrinsic::nvvm_suld_2d_array_i16_zero:
   3181     return NVPTXISD::Suld2DArrayI16Zero;
   3182   case Intrinsic::nvvm_suld_2d_array_i32_zero:
   3183     return NVPTXISD::Suld2DArrayI32Zero;
   3184   case Intrinsic::nvvm_suld_2d_array_i64_zero:
   3185     return NVPTXISD::Suld2DArrayI64Zero;
   3186   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
   3187     return NVPTXISD::Suld2DArrayV2I8Zero;
   3188   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
   3189     return NVPTXISD::Suld2DArrayV2I16Zero;
   3190   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
   3191     return NVPTXISD::Suld2DArrayV2I32Zero;
   3192   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
   3193     return NVPTXISD::Suld2DArrayV2I64Zero;
   3194   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
   3195     return NVPTXISD::Suld2DArrayV4I8Zero;
   3196   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
   3197     return NVPTXISD::Suld2DArrayV4I16Zero;
   3198   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
   3199     return NVPTXISD::Suld2DArrayV4I32Zero;
   3200   case Intrinsic::nvvm_suld_3d_i8_zero:
   3201     return NVPTXISD::Suld3DI8Zero;
   3202   case Intrinsic::nvvm_suld_3d_i16_zero:
   3203     return NVPTXISD::Suld3DI16Zero;
   3204   case Intrinsic::nvvm_suld_3d_i32_zero:
   3205     return NVPTXISD::Suld3DI32Zero;
   3206   case Intrinsic::nvvm_suld_3d_i64_zero:
   3207     return NVPTXISD::Suld3DI64Zero;
   3208   case Intrinsic::nvvm_suld_3d_v2i8_zero:
   3209     return NVPTXISD::Suld3DV2I8Zero;
   3210   case Intrinsic::nvvm_suld_3d_v2i16_zero:
   3211     return NVPTXISD::Suld3DV2I16Zero;
   3212   case Intrinsic::nvvm_suld_3d_v2i32_zero:
   3213     return NVPTXISD::Suld3DV2I32Zero;
   3214   case Intrinsic::nvvm_suld_3d_v2i64_zero:
   3215     return NVPTXISD::Suld3DV2I64Zero;
   3216   case Intrinsic::nvvm_suld_3d_v4i8_zero:
   3217     return NVPTXISD::Suld3DV4I8Zero;
   3218   case Intrinsic::nvvm_suld_3d_v4i16_zero:
   3219     return NVPTXISD::Suld3DV4I16Zero;
   3220   case Intrinsic::nvvm_suld_3d_v4i32_zero:
   3221     return NVPTXISD::Suld3DV4I32Zero;
   3222   }
   3223 }
   3224 
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need information that is only available in the
// "Value" type of the destination pointer, in particular its address space.
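// As a rough sketch of the flow: for an intrinsic call such as the atomic
// float add below, this hook fills Info with the opcode, memory type, and
// pointer operand, which lets SelectionDAGBuilder attach a MachineMemOperand
// (and hence the address-space information) to the node it creates.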
   3230 bool NVPTXTargetLowering::getTgtMemIntrinsic(
   3231     IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
   3232   switch (Intrinsic) {
   3233   default:
   3234     return false;
   3235 
   3236   case Intrinsic::nvvm_atomic_load_add_f32:
   3237     Info.opc = ISD::INTRINSIC_W_CHAIN;
   3238     Info.memVT = MVT::f32;
   3239     Info.ptrVal = I.getArgOperand(0);
   3240     Info.offset = 0;
   3241     Info.vol = 0;
   3242     Info.readMem = true;
   3243     Info.writeMem = true;
   3244     Info.align = 0;
   3245     return true;
   3246 
   3247   case Intrinsic::nvvm_atomic_load_inc_32:
   3248   case Intrinsic::nvvm_atomic_load_dec_32:
   3249     Info.opc = ISD::INTRINSIC_W_CHAIN;
   3250     Info.memVT = MVT::i32;
   3251     Info.ptrVal = I.getArgOperand(0);
   3252     Info.offset = 0;
   3253     Info.vol = 0;
   3254     Info.readMem = true;
   3255     Info.writeMem = true;
   3256     Info.align = 0;
   3257     return true;
   3258 
   3259   case Intrinsic::nvvm_ldu_global_i:
   3260   case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
      Info.memVT = getPointerTy();
    else
      Info.memVT = getValueType(I.getType());
   3270     Info.ptrVal = I.getArgOperand(0);
   3271     Info.offset = 0;
   3272     Info.vol = 0;
   3273     Info.readMem = true;
   3274     Info.writeMem = false;
   3275     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
   3276 
   3277     return true;
   3278   }
   3279   case Intrinsic::nvvm_ldg_global_i:
   3280   case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
      Info.memVT = getPointerTy();
    else
      Info.memVT = getValueType(I.getType());
   3290     Info.ptrVal = I.getArgOperand(0);
   3291     Info.offset = 0;
   3292     Info.vol = 0;
   3293     Info.readMem = true;
   3294     Info.writeMem = false;
   3295     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
   3296 
   3297     return true;
   3298   }
   3299 
   3300   case Intrinsic::nvvm_tex_1d_v4f32_s32:
   3301   case Intrinsic::nvvm_tex_1d_v4f32_f32:
   3302   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
   3303   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
   3304   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
   3305   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
   3306   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
   3307   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
   3308   case Intrinsic::nvvm_tex_2d_v4f32_s32:
   3309   case Intrinsic::nvvm_tex_2d_v4f32_f32:
   3310   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
   3311   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
   3312   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
   3313   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
   3314   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
   3315   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
   3316   case Intrinsic::nvvm_tex_3d_v4f32_s32:
   3317   case Intrinsic::nvvm_tex_3d_v4f32_f32:
   3318   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
   3319   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
   3320   case Intrinsic::nvvm_tex_cube_v4f32_f32:
   3321   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
   3322   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
   3323   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
   3324   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
   3325   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
   3326   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
   3327   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
   3328   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
   3329   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
   3330   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
   3331   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
   3332   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
   3333   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
   3334   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
   3335   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
   3336   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
   3337   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
   3338   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
   3339   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
   3340   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
   3341   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
   3342   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
   3343   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
   3344   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
   3345   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
   3346   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
   3347   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
   3348   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
   3349   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
   3350   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
   3351   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
   3352   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
   3353   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
   3354   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
   3355   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
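    // Unlike the generic cases above, Info.opc here is the target-specific
    // texture opcode itself; the DAG builder uses it directly when building
    // the memory intrinsic node.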
   3356     Info.opc = getOpcForTextureInstr(Intrinsic);
   3357     Info.memVT = MVT::v4f32;
   3358     Info.ptrVal = nullptr;
   3359     Info.offset = 0;
   3360     Info.vol = 0;
   3361     Info.readMem = true;
   3362     Info.writeMem = false;
   3363     Info.align = 16;
   3364     return true;
   3365   }
   3366   case Intrinsic::nvvm_tex_1d_v4s32_s32:
   3367   case Intrinsic::nvvm_tex_1d_v4s32_f32:
   3368   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
   3369   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
   3370   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
   3371   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
   3372   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
   3373   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
   3374   case Intrinsic::nvvm_tex_2d_v4s32_s32:
   3375   case Intrinsic::nvvm_tex_2d_v4s32_f32:
   3376   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
   3377   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
   3378   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
   3379   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
   3380   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
   3381   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
   3382   case Intrinsic::nvvm_tex_3d_v4s32_s32:
   3383   case Intrinsic::nvvm_tex_3d_v4s32_f32:
   3384   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
   3385   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
   3386   case Intrinsic::nvvm_tex_cube_v4s32_f32:
   3387   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
   3388   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
   3389   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
   3390   case Intrinsic::nvvm_tex_cube_v4u32_f32:
   3391   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
   3392   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
   3393   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
   3394   case Intrinsic::nvvm_tex_1d_v4u32_s32:
   3395   case Intrinsic::nvvm_tex_1d_v4u32_f32:
   3396   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
   3397   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
   3398   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
   3399   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
   3400   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
   3401   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
   3402   case Intrinsic::nvvm_tex_2d_v4u32_s32:
   3403   case Intrinsic::nvvm_tex_2d_v4u32_f32:
   3404   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
   3405   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
   3406   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
   3407   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
   3408   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
   3409   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
   3410   case Intrinsic::nvvm_tex_3d_v4u32_s32:
   3411   case Intrinsic::nvvm_tex_3d_v4u32_f32:
   3412   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
   3413   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
   3414   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
   3415   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
   3416   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
   3417   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
   3418   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
   3419   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
   3420   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
   3421   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
   3422   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
   3423   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
   3424   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
   3425   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
   3426   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
   3427   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
   3428   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
   3429   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
   3430   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
   3431   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
   3432   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
   3433   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
   3434   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
   3435   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
   3436   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
   3437   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
   3438   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
   3439   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
   3440   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
   3441   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
   3442   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
   3443   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
   3444   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
   3445   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
   3446   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
   3447   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
   3448   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
   3449   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
   3450   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
   3451   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
   3452   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
   3453   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
   3454   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
   3455   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
   3456   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
   3457   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
   3458   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
   3459   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
   3460   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
   3461   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
   3462   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
   3463   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
   3464   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
   3465   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
   3466   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
   3467   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
   3468   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
   3469   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
   3470   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
   3471   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
   3472   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
   3473   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
   3474   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
   3475   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
   3476   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
   3477   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
   3478     Info.opc = getOpcForTextureInstr(Intrinsic);
   3479     Info.memVT = MVT::v4i32;
   3480     Info.ptrVal = nullptr;
   3481     Info.offset = 0;
   3482     Info.vol = 0;
   3483     Info.readMem = true;
   3484     Info.writeMem = false;
   3485     Info.align = 16;
   3486     return true;
   3487   }
   3488   case Intrinsic::nvvm_suld_1d_i8_clamp:
   3489   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
   3490   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
   3491   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
   3492   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
   3493   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
   3494   case Intrinsic::nvvm_suld_2d_i8_clamp:
   3495   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
   3496   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
   3497   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
   3498   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
   3499   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
   3500   case Intrinsic::nvvm_suld_3d_i8_clamp:
   3501   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
   3502   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
   3503   case Intrinsic::nvvm_suld_1d_i8_trap:
   3504   case Intrinsic::nvvm_suld_1d_v2i8_trap:
   3505   case Intrinsic::nvvm_suld_1d_v4i8_trap:
   3506   case Intrinsic::nvvm_suld_1d_array_i8_trap:
   3507   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
   3508   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
   3509   case Intrinsic::nvvm_suld_2d_i8_trap:
   3510   case Intrinsic::nvvm_suld_2d_v2i8_trap:
   3511   case Intrinsic::nvvm_suld_2d_v4i8_trap:
   3512   case Intrinsic::nvvm_suld_2d_array_i8_trap:
   3513   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
   3514   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
   3515   case Intrinsic::nvvm_suld_3d_i8_trap:
   3516   case Intrinsic::nvvm_suld_3d_v2i8_trap:
   3517   case Intrinsic::nvvm_suld_3d_v4i8_trap:
   3518   case Intrinsic::nvvm_suld_1d_i8_zero:
   3519   case Intrinsic::nvvm_suld_1d_v2i8_zero:
   3520   case Intrinsic::nvvm_suld_1d_v4i8_zero:
   3521   case Intrinsic::nvvm_suld_1d_array_i8_zero:
   3522   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
   3523   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
   3524   case Intrinsic::nvvm_suld_2d_i8_zero:
   3525   case Intrinsic::nvvm_suld_2d_v2i8_zero:
   3526   case Intrinsic::nvvm_suld_2d_v4i8_zero:
   3527   case Intrinsic::nvvm_suld_2d_array_i8_zero:
   3528   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
   3529   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
   3530   case Intrinsic::nvvm_suld_3d_i8_zero:
   3531   case Intrinsic::nvvm_suld_3d_v2i8_zero:
   3532   case Intrinsic::nvvm_suld_3d_v4i8_zero: {
   3533     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3534     Info.memVT = MVT::i8;
   3535     Info.ptrVal = nullptr;
   3536     Info.offset = 0;
   3537     Info.vol = 0;
   3538     Info.readMem = true;
   3539     Info.writeMem = false;
   3540     Info.align = 16;
   3541     return true;
   3542   }
   3543   case Intrinsic::nvvm_suld_1d_i16_clamp:
   3544   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
   3545   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
   3546   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
   3547   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
   3548   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
   3549   case Intrinsic::nvvm_suld_2d_i16_clamp:
   3550   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
   3551   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
   3552   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
   3553   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
   3554   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
   3555   case Intrinsic::nvvm_suld_3d_i16_clamp:
   3556   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
   3557   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
   3558   case Intrinsic::nvvm_suld_1d_i16_trap:
   3559   case Intrinsic::nvvm_suld_1d_v2i16_trap:
   3560   case Intrinsic::nvvm_suld_1d_v4i16_trap:
   3561   case Intrinsic::nvvm_suld_1d_array_i16_trap:
   3562   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
   3563   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
   3564   case Intrinsic::nvvm_suld_2d_i16_trap:
   3565   case Intrinsic::nvvm_suld_2d_v2i16_trap:
   3566   case Intrinsic::nvvm_suld_2d_v4i16_trap:
   3567   case Intrinsic::nvvm_suld_2d_array_i16_trap:
   3568   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
   3569   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
   3570   case Intrinsic::nvvm_suld_3d_i16_trap:
   3571   case Intrinsic::nvvm_suld_3d_v2i16_trap:
   3572   case Intrinsic::nvvm_suld_3d_v4i16_trap:
   3573   case Intrinsic::nvvm_suld_1d_i16_zero:
   3574   case Intrinsic::nvvm_suld_1d_v2i16_zero:
   3575   case Intrinsic::nvvm_suld_1d_v4i16_zero:
   3576   case Intrinsic::nvvm_suld_1d_array_i16_zero:
   3577   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
   3578   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
   3579   case Intrinsic::nvvm_suld_2d_i16_zero:
   3580   case Intrinsic::nvvm_suld_2d_v2i16_zero:
   3581   case Intrinsic::nvvm_suld_2d_v4i16_zero:
   3582   case Intrinsic::nvvm_suld_2d_array_i16_zero:
   3583   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
   3584   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
   3585   case Intrinsic::nvvm_suld_3d_i16_zero:
   3586   case Intrinsic::nvvm_suld_3d_v2i16_zero:
   3587   case Intrinsic::nvvm_suld_3d_v4i16_zero: {
   3588     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3589     Info.memVT = MVT::i16;
   3590     Info.ptrVal = nullptr;
   3591     Info.offset = 0;
   3592     Info.vol = 0;
   3593     Info.readMem = true;
   3594     Info.writeMem = false;
   3595     Info.align = 16;
   3596     return true;
   3597   }
   3598   case Intrinsic::nvvm_suld_1d_i32_clamp:
   3599   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
   3600   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
   3601   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
   3602   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
   3603   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
   3604   case Intrinsic::nvvm_suld_2d_i32_clamp:
   3605   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
   3606   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
   3607   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
   3608   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
   3609   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
   3610   case Intrinsic::nvvm_suld_3d_i32_clamp:
   3611   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
   3612   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
   3613   case Intrinsic::nvvm_suld_1d_i32_trap:
   3614   case Intrinsic::nvvm_suld_1d_v2i32_trap:
   3615   case Intrinsic::nvvm_suld_1d_v4i32_trap:
   3616   case Intrinsic::nvvm_suld_1d_array_i32_trap:
   3617   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
   3618   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
   3619   case Intrinsic::nvvm_suld_2d_i32_trap:
   3620   case Intrinsic::nvvm_suld_2d_v2i32_trap:
   3621   case Intrinsic::nvvm_suld_2d_v4i32_trap:
   3622   case Intrinsic::nvvm_suld_2d_array_i32_trap:
   3623   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
   3624   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
   3625   case Intrinsic::nvvm_suld_3d_i32_trap:
   3626   case Intrinsic::nvvm_suld_3d_v2i32_trap:
   3627   case Intrinsic::nvvm_suld_3d_v4i32_trap:
   3628   case Intrinsic::nvvm_suld_1d_i32_zero:
   3629   case Intrinsic::nvvm_suld_1d_v2i32_zero:
   3630   case Intrinsic::nvvm_suld_1d_v4i32_zero:
   3631   case Intrinsic::nvvm_suld_1d_array_i32_zero:
   3632   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
   3633   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
   3634   case Intrinsic::nvvm_suld_2d_i32_zero:
   3635   case Intrinsic::nvvm_suld_2d_v2i32_zero:
   3636   case Intrinsic::nvvm_suld_2d_v4i32_zero:
   3637   case Intrinsic::nvvm_suld_2d_array_i32_zero:
   3638   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
   3639   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
   3640   case Intrinsic::nvvm_suld_3d_i32_zero:
   3641   case Intrinsic::nvvm_suld_3d_v2i32_zero:
   3642   case Intrinsic::nvvm_suld_3d_v4i32_zero: {
   3643     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3644     Info.memVT = MVT::i32;
   3645     Info.ptrVal = nullptr;
   3646     Info.offset = 0;
   3647     Info.vol = 0;
   3648     Info.readMem = true;
   3649     Info.writeMem = false;
   3650     Info.align = 16;
   3651     return true;
   3652   }
   3653   case Intrinsic::nvvm_suld_1d_i64_clamp:
   3654   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
   3655   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
   3656   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
   3657   case Intrinsic::nvvm_suld_2d_i64_clamp:
   3658   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
   3659   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
   3660   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
   3661   case Intrinsic::nvvm_suld_3d_i64_clamp:
   3662   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
   3663   case Intrinsic::nvvm_suld_1d_i64_trap:
   3664   case Intrinsic::nvvm_suld_1d_v2i64_trap:
   3665   case Intrinsic::nvvm_suld_1d_array_i64_trap:
   3666   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
   3667   case Intrinsic::nvvm_suld_2d_i64_trap:
   3668   case Intrinsic::nvvm_suld_2d_v2i64_trap:
   3669   case Intrinsic::nvvm_suld_2d_array_i64_trap:
   3670   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
   3671   case Intrinsic::nvvm_suld_3d_i64_trap:
   3672   case Intrinsic::nvvm_suld_3d_v2i64_trap:
   3673   case Intrinsic::nvvm_suld_1d_i64_zero:
   3674   case Intrinsic::nvvm_suld_1d_v2i64_zero:
   3675   case Intrinsic::nvvm_suld_1d_array_i64_zero:
   3676   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
   3677   case Intrinsic::nvvm_suld_2d_i64_zero:
   3678   case Intrinsic::nvvm_suld_2d_v2i64_zero:
   3679   case Intrinsic::nvvm_suld_2d_array_i64_zero:
   3680   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
   3681   case Intrinsic::nvvm_suld_3d_i64_zero:
   3682   case Intrinsic::nvvm_suld_3d_v2i64_zero: {
   3683     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3684     Info.memVT = MVT::i64;
   3685     Info.ptrVal = nullptr;
   3686     Info.offset = 0;
   3687     Info.vol = 0;
   3688     Info.readMem = true;
   3689     Info.writeMem = false;
   3690     Info.align = 16;
   3691     return true;
   3692   }
   3693   }
   3694   return false;
   3695 }
   3696 
   3697 /// isLegalAddressingMode - Return true if the addressing mode represented
   3698 /// by AM is legal for this target, for a load/store of the specified type.
    3699 /// Used to guide target-specific optimizations, like loop strength reduction
    3700 /// (LoopStrengthReduce.cpp) and memory optimization for address modes
    3701 /// (CodeGenPrepare.cpp).
   3702 bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   3703                                                 Type *Ty) const {
   3704 
   3705   // AddrMode - This represents an addressing mode of:
   3706   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
   3707   //
   3708   // The legal address modes are
   3709   // - [avar]
   3710   // - [areg]
   3711   // - [areg+immoff]
   3712   // - [immAddr]
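           //
           // For example, [areg+immoff] is a base register plus a constant
           // offset (AM.HasBaseReg with a nonzero AM.BaseOffs), as in the PTX
           // access "ld.global.f32 %f0, [%r1+16];".  Scaled-index forms such
           // as [areg+scale*reg] are rejected below, since PTX has no scaled
           // addressing mode.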
   3713 
   3714   if (AM.BaseGV) {
   3715     if (AM.BaseOffs || AM.HasBaseReg || AM.Scale)
   3716       return false;
   3717     return true;
   3718   }
   3719 
   3720   switch (AM.Scale) {
   3721   case 0: // "r", "r+i" or "i" is allowed
   3722     break;
   3723   case 1:
   3724     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
   3725       return false;
   3726     // Otherwise we have r+i.
   3727     break;
   3728   default:
   3729     // No scale > 1 is allowed
   3730     return false;
   3731   }
   3732   return true;
   3733 }
   3734 
   3735 //===----------------------------------------------------------------------===//
   3736 //                         NVPTX Inline Assembly Support
   3737 //===----------------------------------------------------------------------===//
   3738 
   3739 /// getConstraintType - Given a constraint letter, return the type of
   3740 /// constraint it is for this target.
   3741 NVPTXTargetLowering::ConstraintType
   3742 NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
   3743   if (Constraint.size() == 1) {
   3744     switch (Constraint[0]) {
   3745     default:
   3746       break;
   3747     case 'b':
   3748     case 'r':
   3749     case 'h':
   3750     case 'c':
   3751     case 'l':
   3752     case 'f':
   3753     case 'd':
   3754     case '0':
   3755     case 'N':
   3756       return C_RegisterClass;
   3757     }
   3758   }
   3759   return TargetLowering::getConstraintType(Constraint);
   3760 }
   3761 
   3762 std::pair<unsigned, const TargetRegisterClass *>
   3763 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   3764                                                   const std::string &Constraint,
   3765                                                   MVT VT) const {
   3766   if (Constraint.size() == 1) {
   3767     switch (Constraint[0]) {
   3768     case 'b':
   3769       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
   3770     case 'c':
   3771       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
   3772     case 'h':
   3773       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
   3774     case 'r':
   3775       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
   3776     case 'l':
   3777     case 'N':
   3778       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
   3779     case 'f':
   3780       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
   3781     case 'd':
   3782       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
   3783     }
   3784   }
   3785   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   3786 }
   3787 
   3788 /// getFunctionAlignment - Return the Log2 alignment of this function.
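         /// The returned value 4 means 2^4 == 16-byte alignment.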
   3789 unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
   3790   return 4;
   3791 }
   3792 
   3793 //===----------------------------------------------------------------------===//
   3794 //                         NVPTX DAG Combining
   3795 //===----------------------------------------------------------------------===//
   3796 
   3797 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
   3798                                    CodeGenOpt::Level OptLevel) const {
   3799   const Function *F = MF.getFunction();
   3800   const TargetOptions &TO = MF.getTarget().Options;
   3801 
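           // Precedence: an explicit -nvptx-fma-level flag, then the
           // optimization level, then TargetOptions, then the function's
           // "unsafe-fp-math" attribute.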
   3802   // Always honor command-line argument
   3803   if (FMAContractLevelOpt.getNumOccurrences() > 0) {
   3804     return FMAContractLevelOpt > 0;
   3805   } else if (OptLevel == 0) {
   3806     // Do not contract if we're not optimizing the code
   3807     return false;
   3808   } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
   3809     // Honor TargetOptions flags that explicitly say fusion is okay
   3810     return true;
   3811   } else if (F->hasFnAttribute("unsafe-fp-math")) {
   3812     // Check for unsafe-fp-math=true coming from Clang
   3813     Attribute Attr = F->getFnAttribute("unsafe-fp-math");
   3814     StringRef Val = Attr.getValueAsString();
   3815     if (Val == "true")
   3816       return true;
   3817   }
   3818 
   3819   // We did not have a clear indication that fusion is allowed, so assume not
   3820   return false;
   3821 }
   3822 
   3823 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
   3824 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
   3825 /// called with the default operands, and if that fails, with commuted
   3826 /// operands.
   3827 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
   3828                                            TargetLowering::DAGCombinerInfo &DCI,
   3829                                              const NVPTXSubtarget &Subtarget,
   3830                                              CodeGenOpt::Level OptLevel) {
    3831   SelectionDAG &DAG = DCI.DAG;
    3832   // Skip the vector case; this combine handles only scalar values.
    3833   EVT VT = N0.getValueType();
   3834   if (VT.isVector())
   3835     return SDValue();
   3836 
   3837   // fold (add (mul a, b), c) -> (mad a, b, c)
   3838   //
   3839   if (N0.getOpcode() == ISD::MUL) {
    3840     assert(VT.isInteger());
    3841     // For integer:
    3842     // Since an integer multiply-add costs the same as an integer multiply
    3843     // but is more costly than an integer add, do the fusion only when
    3844     // the mul is used solely by the add.
    3845     if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
   3846         !N0.getNode()->hasOneUse())
   3847       return SDValue();
   3848 
   3849     // Do the folding
   3850     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
   3851                        N0.getOperand(0), N0.getOperand(1), N1);
    3852   } else if (N0.getOpcode() == ISD::FMUL) {
   3854     if (VT == MVT::f32 || VT == MVT::f64) {
   3855       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
   3856           &DAG.getTargetLoweringInfo());
   3857       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
   3858         return SDValue();
   3859 
   3860       // For floating point:
    3861       // Do the fusion only when the mul has fewer than 5 uses and all
    3862       // of them are adds.
    3863       // The heuristic is that if a use is not an add, then that use
    3864       // cannot be fused into an fma, so the mul is still needed anyway.
    3865       // If there are more than 4 uses, even if they are all adds, fusing
    3866       // them will increase register pressure.
   3867       //
   3868       int numUses = 0;
   3869       int nonAddCount = 0;
   3870       for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
   3871            UE = N0.getNode()->use_end();
   3872            UI != UE; ++UI) {
   3873         numUses++;
   3874         SDNode *User = *UI;
   3875         if (User->getOpcode() != ISD::FADD)
   3876           ++nonAddCount;
   3877       }
   3878       if (numUses >= 5)
   3879         return SDValue();
   3880       if (nonAddCount) {
   3881         int orderNo = N->getIROrder();
   3882         int orderNo2 = N0.getNode()->getIROrder();
    3883         // Simple heuristic for estimating potential register pressure:
    3884         // the difference in IR order approximates the distance between
    3885         // the def and this use, and the longer that distance, the more
    3886         // likely it is to increase register pressure.
   3887         if (orderNo - orderNo2 < 500)
   3888           return SDValue();
   3889 
    3890         // Now check whether at least one of the FMUL's operands is live
    3891         // beyond N; if so, the FMA will not increase register pressure at N.
   3892         bool opIsLive = false;
   3893         const SDNode *left = N0.getOperand(0).getNode();
   3894         const SDNode *right = N0.getOperand(1).getNode();
   3895 
   3896         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
   3897           opIsLive = true;
   3898 
   3899         if (!opIsLive)
    3900           for (SDNode::use_iterator UI = left->use_begin(),
                                             UE = left->use_end(); UI != UE; ++UI) {
   3901             SDNode *User = *UI;
   3902             int orderNo3 = User->getIROrder();
   3903             if (orderNo3 > orderNo) {
   3904               opIsLive = true;
   3905               break;
   3906             }
   3907           }
   3908 
   3909         if (!opIsLive)
    3910           for (SDNode::use_iterator UI = right->use_begin(),
                                             UE = right->use_end(); UI != UE; ++UI) {
   3911             SDNode *User = *UI;
   3912             int orderNo3 = User->getIROrder();
   3913             if (orderNo3 > orderNo) {
   3914               opIsLive = true;
   3915               break;
   3916             }
   3917           }
   3918 
   3919         if (!opIsLive)
   3920           return SDValue();
   3921       }
   3922 
   3923       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
   3924                          N0.getOperand(0), N0.getOperand(1), N1);
   3925     }
   3926   }
   3927 
   3928   return SDValue();
   3929 }
   3930 
   3931 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
   3932 ///
   3933 static SDValue PerformADDCombine(SDNode *N,
   3934                                  TargetLowering::DAGCombinerInfo &DCI,
   3935                                  const NVPTXSubtarget &Subtarget,
   3936                                  CodeGenOpt::Level OptLevel) {
   3937   SDValue N0 = N->getOperand(0);
   3938   SDValue N1 = N->getOperand(1);
   3939 
   3940   // First try with the default operand order.
   3941   SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
   3942                                                  OptLevel);
   3943   if (Result.getNode())
   3944     return Result;
   3945 
   3946   // If that didn't work, try again with the operands commuted.
   3947   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
   3948 }
   3949 
   3950 static SDValue PerformANDCombine(SDNode *N,
   3951                                  TargetLowering::DAGCombinerInfo &DCI) {
    3952   // The type legalizer turns a vector load of i8 values into a zextload to
    3953   // i16 registers, optionally ANY_EXTENDs it (if the target type is integer),
    3954   // and ANDs off the high 8 bits. Since we turn this load into a
    3955   // target-specific DAG node, the DAG combiner fails to eliminate these AND
    3956   // nodes. Do that here.
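           //
           // For example, (and (any_extend (NVPTXISD::LoadV2 ...)), 0xff)
           // folds to the extended load itself, since the i8 elements were
           // already zero-extended by the load.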
   3957   SDValue Val = N->getOperand(0);
   3958   SDValue Mask = N->getOperand(1);
   3959 
   3960   if (isa<ConstantSDNode>(Val)) {
   3961     std::swap(Val, Mask);
   3962   }
   3963 
   3964   SDValue AExt;
   3965   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
   3966   if (Val.getOpcode() == ISD::ANY_EXTEND) {
   3967     AExt = Val;
   3968     Val = Val->getOperand(0);
   3969   }
   3970 
   3971   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
   3972     Val = Val->getOperand(0);
   3973   }
   3974 
   3975   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
   3976       Val->getOpcode() == NVPTXISD::LoadV4) {
   3977     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
   3978     if (!MaskCnst) {
   3979       // Not an AND with a constant
   3980       return SDValue();
   3981     }
   3982 
   3983     uint64_t MaskVal = MaskCnst->getZExtValue();
   3984     if (MaskVal != 0xff) {
   3985       // Not an AND that chops off top 8 bits
   3986       return SDValue();
   3987     }
   3988 
   3989     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
   3990     if (!Mem) {
   3991       // Not a MemSDNode?!?
   3992       return SDValue();
   3993     }
   3994 
   3995     EVT MemVT = Mem->getMemoryVT();
   3996     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
   3997       // We only handle the i8 case
   3998       return SDValue();
   3999     }
   4000 
   4001     unsigned ExtType =
   4002       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
   4003         getZExtValue();
   4004     if (ExtType == ISD::SEXTLOAD) {
   4005       // If for some reason the load is a sextload, the and is needed to zero
   4006       // out the high 8 bits
   4007       return SDValue();
   4008     }
   4009 
   4010     bool AddTo = false;
    4011     if (AExt.getNode() != nullptr) {
   4012       // Re-insert the ext as a zext.
   4013       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
   4014                             AExt.getValueType(), Val);
   4015       AddTo = true;
   4016     }
   4017 
   4018     // If we get here, the AND is unnecessary.  Just replace it with the load
   4019     DCI.CombineTo(N, Val, AddTo);
   4020   }
   4021 
   4022   return SDValue();
   4023 }
   4024 
   4025 enum OperandSignedness {
   4026   Signed = 0,
   4027   Unsigned,
   4028   Unknown
   4029 };
   4030 
   4031 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
   4032 /// that can be demoted to \p OptSize bits without loss of information. The
   4033 /// signedness of the operand, if determinable, is placed in \p S.
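         /// For example, (sext i16 %a to i32) is demotable to 16 bits: the
         /// original value is recoverable and its signedness is known.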
   4034 static bool IsMulWideOperandDemotable(SDValue Op,
   4035                                       unsigned OptSize,
   4036                                       OperandSignedness &S) {
   4037   S = Unknown;
   4038 
   4039   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
   4040       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
   4041     EVT OrigVT = Op.getOperand(0).getValueType();
   4042     if (OrigVT.getSizeInBits() <= OptSize) {
   4043       S = Signed;
   4044       return true;
   4045     }
   4046   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
   4047     EVT OrigVT = Op.getOperand(0).getValueType();
   4048     if (OrigVT.getSizeInBits() <= OptSize) {
   4049       S = Unsigned;
   4050       return true;
   4051     }
   4052   }
   4053 
   4054   return false;
   4055 }
   4056 
   4057 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
   4058 /// be demoted to \p OptSize bits without loss of information. If the operands
   4059 /// contain a constant, it should appear as the RHS operand. The signedness of
   4060 /// the operands is placed in \p IsSigned.
   4061 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
   4062                                         unsigned OptSize,
   4063                                         bool &IsSigned) {
   4064 
   4065   OperandSignedness LHSSign;
   4066 
   4067   // The LHS operand must be a demotable op
   4068   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
   4069     return false;
   4070 
   4071   // We should have been able to determine the signedness from the LHS
   4072   if (LHSSign == Unknown)
   4073     return false;
   4074 
   4075   IsSigned = (LHSSign == Signed);
   4076 
   4077   // The RHS can be a demotable op or a constant
   4078   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
   4079     APInt Val = CI->getAPIntValue();
   4080     if (LHSSign == Unsigned) {
   4081       if (Val.isIntN(OptSize)) {
   4082         return true;
   4083       }
   4084       return false;
   4085     } else {
   4086       if (Val.isSignedIntN(OptSize)) {
   4087         return true;
   4088       }
   4089       return false;
   4090     }
   4091   } else {
   4092     OperandSignedness RHSSign;
   4093     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
   4094       return false;
   4095 
   4096     if (LHSSign != RHSSign)
   4097       return false;
   4098 
   4099     return true;
   4100   }
   4101 }
   4102 
   4103 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
   4104 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
   4105 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
   4106 /// amount.
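         ///
         /// For example, (mul (sext i16 %a to i32), (sext i16 %b to i32)) can
         /// become (MUL_WIDE_SIGNED %a, %b), which selects to the PTX
         /// mul.wide.s16 instruction producing the full 32-bit product.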
   4107 static SDValue TryMULWIDECombine(SDNode *N,
   4108                                  TargetLowering::DAGCombinerInfo &DCI) {
   4109   EVT MulType = N->getValueType(0);
   4110   if (MulType != MVT::i32 && MulType != MVT::i64) {
   4111     return SDValue();
   4112   }
   4113 
   4114   unsigned OptSize = MulType.getSizeInBits() >> 1;
   4115   SDValue LHS = N->getOperand(0);
   4116   SDValue RHS = N->getOperand(1);
   4117 
   4118   // Canonicalize the multiply so the constant (if any) is on the right
   4119   if (N->getOpcode() == ISD::MUL) {
   4120     if (isa<ConstantSDNode>(LHS)) {
   4121       std::swap(LHS, RHS);
   4122     }
   4123   }
   4124 
   4125   // If we have a SHL, determine the actual multiply amount
   4126   if (N->getOpcode() == ISD::SHL) {
   4127     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
   4128     if (!ShlRHS) {
   4129       return SDValue();
   4130     }
   4131 
   4132     APInt ShiftAmt = ShlRHS->getAPIntValue();
   4133     unsigned BitWidth = MulType.getSizeInBits();
   4134     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
   4135       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
   4136       RHS = DCI.DAG.getConstant(MulVal, MulType);
   4137     } else {
   4138       return SDValue();
   4139     }
   4140   }
   4141 
   4142   bool Signed;
   4143   // Verify that our operands are demotable
   4144   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
   4145     return SDValue();
   4146   }
   4147 
   4148   EVT DemotedVT;
   4149   if (MulType == MVT::i32) {
   4150     DemotedVT = MVT::i16;
   4151   } else {
   4152     DemotedVT = MVT::i32;
   4153   }
   4154 
   4155   // Truncate the operands to the correct size. Note that these are just for
   4156   // type consistency and will (likely) be eliminated in later phases.
   4157   SDValue TruncLHS =
   4158     DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS);
   4159   SDValue TruncRHS =
   4160     DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS);
   4161 
   4162   unsigned Opc;
   4163   if (Signed) {
   4164     Opc = NVPTXISD::MUL_WIDE_SIGNED;
   4165   } else {
   4166     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
   4167   }
   4168 
   4169   return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS);
   4170 }
   4171 
   4172 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
   4173 static SDValue PerformMULCombine(SDNode *N,
   4174                                  TargetLowering::DAGCombinerInfo &DCI,
   4175                                  CodeGenOpt::Level OptLevel) {
   4176   if (OptLevel > 0) {
   4177     // Try mul.wide combining at OptLevel > 0
   4178     SDValue Ret = TryMULWIDECombine(N, DCI);
   4179     if (Ret.getNode())
   4180       return Ret;
   4181   }
   4182 
   4183   return SDValue();
   4184 }
   4185 
   4186 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
   4187 static SDValue PerformSHLCombine(SDNode *N,
   4188                                  TargetLowering::DAGCombinerInfo &DCI,
   4189                                  CodeGenOpt::Level OptLevel) {
   4190   if (OptLevel > 0) {
   4191     // Try mul.wide combining at OptLevel > 0
   4192     SDValue Ret = TryMULWIDECombine(N, DCI);
   4193     if (Ret.getNode())
   4194       return Ret;
   4195   }
   4196 
   4197   return SDValue();
   4198 }
   4199 
   4200 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
   4201                                                DAGCombinerInfo &DCI) const {
   4202   CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
   4203   switch (N->getOpcode()) {
   4204     default: break;
   4205     case ISD::ADD:
   4206     case ISD::FADD:
   4207       return PerformADDCombine(N, DCI, STI, OptLevel);
   4208     case ISD::MUL:
   4209       return PerformMULCombine(N, DCI, OptLevel);
   4210     case ISD::SHL:
   4211       return PerformSHLCombine(N, DCI, OptLevel);
   4212     case ISD::AND:
   4213       return PerformANDCombine(N, DCI);
   4214   }
   4215   return SDValue();
   4216 }
   4217 
    4218 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
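         /// For example, a <2 x float> load becomes a single NVPTXISD::LoadV2
         /// node producing two f32 results plus a chain.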
   4219 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   4220                               const DataLayout *TD,
   4221                               SmallVectorImpl<SDValue> &Results) {
   4222   EVT ResVT = N->getValueType(0);
   4223   SDLoc DL(N);
   4224 
   4225   assert(ResVT.isVector() && "Vector load must have vector type");
   4226 
   4227   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
   4228   // legal.  We can (and should) split that into 2 loads of <2 x double> here
   4229   // but I'm leaving that as a TODO for now.
   4230   assert(ResVT.isSimple() && "Can only handle simple types");
   4231   switch (ResVT.getSimpleVT().SimpleTy) {
   4232   default:
   4233     return;
   4234   case MVT::v2i8:
   4235   case MVT::v2i16:
   4236   case MVT::v2i32:
   4237   case MVT::v2i64:
   4238   case MVT::v2f32:
   4239   case MVT::v2f64:
   4240   case MVT::v4i8:
   4241   case MVT::v4i16:
   4242   case MVT::v4i32:
   4243   case MVT::v4f32:
   4244     // This is a "native" vector type
   4245     break;
   4246   }
   4247 
   4248   LoadSDNode *LD = cast<LoadSDNode>(N);
   4249 
   4250   unsigned Align = LD->getAlignment();
   4251   unsigned PrefAlign =
   4252     TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
   4253   if (Align < PrefAlign) {
   4254     // This load is not sufficiently aligned, so bail out and let this vector
   4255     // load be scalarized.  Note that we may still be able to emit smaller
   4256     // vector loads.  For example, if we are loading a <4 x float> with an
   4257     // alignment of 8, this check will fail but the legalizer will try again
   4258     // with 2 x <2 x float>, which will succeed with an alignment of 8.
   4259     return;
   4260   }
   4261 
   4262   EVT EltVT = ResVT.getVectorElementType();
   4263   unsigned NumElts = ResVT.getVectorNumElements();
   4264 
   4265   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
   4266   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
   4267   // loaded type to i16 and propagate the "real" type as the memory type.
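           // For example, a <2 x i8> load is emitted as a LoadV2 yielding two
           // i16 values with memory type v2i8; the results are truncated back
           // to i8 below.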
   4268   bool NeedTrunc = false;
   4269   if (EltVT.getSizeInBits() < 16) {
   4270     EltVT = MVT::i16;
   4271     NeedTrunc = true;
   4272   }
   4273 
   4274   unsigned Opcode = 0;
   4275   SDVTList LdResVTs;
   4276 
   4277   switch (NumElts) {
   4278   default:
   4279     return;
   4280   case 2:
   4281     Opcode = NVPTXISD::LoadV2;
   4282     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
   4283     break;
   4284   case 4: {
   4285     Opcode = NVPTXISD::LoadV4;
   4286     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
   4287     LdResVTs = DAG.getVTList(ListVTs);
   4288     break;
   4289   }
   4290   }
   4291 
   4292   // Copy regular operands
   4293   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
   4294 
   4295   // The select routine does not have access to the LoadSDNode instance, so
   4296   // pass along the extension information
   4297   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
   4298 
   4299   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
   4300                                           LD->getMemoryVT(),
   4301                                           LD->getMemOperand());
   4302 
   4303   SmallVector<SDValue, 4> ScalarRes;
   4304 
   4305   for (unsigned i = 0; i < NumElts; ++i) {
   4306     SDValue Res = NewLD.getValue(i);
   4307     if (NeedTrunc)
   4308       Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
   4309     ScalarRes.push_back(Res);
   4310   }
   4311 
   4312   SDValue LoadChain = NewLD.getValue(NumElts);
   4313 
   4314   SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
   4315 
   4316   Results.push_back(BuildVec);
   4317   Results.push_back(LoadChain);
   4318 }
   4319 
   4320 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
   4321                                      SmallVectorImpl<SDValue> &Results) {
   4322   SDValue Chain = N->getOperand(0);
   4323   SDValue Intrin = N->getOperand(1);
   4324   SDLoc DL(N);
   4325 
   4326   // Get the intrinsic ID
   4327   unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
   4328   switch (IntrinNo) {
   4329   default:
   4330     return;
   4331   case Intrinsic::nvvm_ldg_global_i:
   4332   case Intrinsic::nvvm_ldg_global_f:
   4333   case Intrinsic::nvvm_ldg_global_p:
   4334   case Intrinsic::nvvm_ldu_global_i:
   4335   case Intrinsic::nvvm_ldu_global_f:
   4336   case Intrinsic::nvvm_ldu_global_p: {
   4337     EVT ResVT = N->getValueType(0);
   4338 
   4339     if (ResVT.isVector()) {
   4340       // Vector LDG/LDU
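               //
               // For example, a <4 x float> ldg becomes an NVPTXISD::LDGV4
               // node producing four f32 results plus a chain.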
   4341 
   4342       unsigned NumElts = ResVT.getVectorNumElements();
   4343       EVT EltVT = ResVT.getVectorElementType();
   4344 
   4345       // Since LDU/LDG are target nodes, we cannot rely on DAG type
   4346       // legalization.
   4347       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
   4348       // loaded type to i16 and propagate the "real" type as the memory type.
   4349       bool NeedTrunc = false;
   4350       if (EltVT.getSizeInBits() < 16) {
   4351         EltVT = MVT::i16;
   4352         NeedTrunc = true;
   4353       }
   4354 
   4355       unsigned Opcode = 0;
   4356       SDVTList LdResVTs;
   4357 
   4358       switch (NumElts) {
   4359       default:
   4360         return;
   4361       case 2:
   4362         switch (IntrinNo) {
   4363         default:
   4364           return;
   4365         case Intrinsic::nvvm_ldg_global_i:
   4366         case Intrinsic::nvvm_ldg_global_f:
   4367         case Intrinsic::nvvm_ldg_global_p:
   4368           Opcode = NVPTXISD::LDGV2;
   4369           break;
   4370         case Intrinsic::nvvm_ldu_global_i:
   4371         case Intrinsic::nvvm_ldu_global_f:
   4372         case Intrinsic::nvvm_ldu_global_p:
   4373           Opcode = NVPTXISD::LDUV2;
   4374           break;
   4375         }
   4376         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
   4377         break;
   4378       case 4: {
   4379         switch (IntrinNo) {
   4380         default:
   4381           return;
   4382         case Intrinsic::nvvm_ldg_global_i:
   4383         case Intrinsic::nvvm_ldg_global_f:
   4384         case Intrinsic::nvvm_ldg_global_p:
   4385           Opcode = NVPTXISD::LDGV4;
   4386           break;
   4387         case Intrinsic::nvvm_ldu_global_i:
   4388         case Intrinsic::nvvm_ldu_global_f:
   4389         case Intrinsic::nvvm_ldu_global_p:
   4390           Opcode = NVPTXISD::LDUV4;
   4391           break;
   4392         }
   4393         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
   4394         LdResVTs = DAG.getVTList(ListVTs);
   4395         break;
   4396       }
   4397       }
   4398 
   4399       SmallVector<SDValue, 8> OtherOps;
   4400 
   4401       // Copy regular operands
   4402 
   4403       OtherOps.push_back(Chain); // Chain
   4404                                  // Skip operand 1 (intrinsic ID)
   4405       // Others
   4406       OtherOps.append(N->op_begin() + 2, N->op_end());
   4407 
   4408       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
   4409 
   4410       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
   4411                                               MemSD->getMemoryVT(),
   4412                                               MemSD->getMemOperand());
   4413 
   4414       SmallVector<SDValue, 4> ScalarRes;
   4415 
   4416       for (unsigned i = 0; i < NumElts; ++i) {
   4417         SDValue Res = NewLD.getValue(i);
   4418         if (NeedTrunc)
   4419           Res =
   4420               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
   4421         ScalarRes.push_back(Res);
   4422       }
   4423 
   4424       SDValue LoadChain = NewLD.getValue(NumElts);
   4425 
   4426       SDValue BuildVec =
   4427           DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
   4428 
   4429       Results.push_back(BuildVec);
   4430       Results.push_back(LoadChain);
   4431     } else {
   4432       // i8 LDG/LDU
   4433       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
   4434              "Custom handling of non-i8 ldu/ldg?");
   4435 
   4436       // Just copy all operands as-is
   4437       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
   4438 
   4439       // Force output to i16
   4440       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
   4441 
   4442       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
   4443 
   4444       // We make sure the memory type is i8, which will be used during isel
   4445       // to select the proper instruction.
   4446       SDValue NewLD =
   4447           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
   4448                                   MVT::i8, MemSD->getMemOperand());
   4449 
   4450       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
   4451                                     NewLD.getValue(0)));
   4452       Results.push_back(NewLD.getValue(1));
   4453     }
   4454   }
   4455   }
   4456 }
   4457 
   4458 void NVPTXTargetLowering::ReplaceNodeResults(
   4459     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   4460   switch (N->getOpcode()) {
   4461   default:
   4462     report_fatal_error("Unhandled custom legalization");
   4463   case ISD::LOAD:
   4464     ReplaceLoadVector(N, DAG, getDataLayout(), Results);
   4465     return;
   4466   case ISD::INTRINSIC_W_CHAIN:
   4467     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
   4468     return;
   4469   }
   4470 }
   4471 
   4472 // Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
   4473 void NVPTXSection::anchor() {}
   4474 
   4475 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
   4476   delete TextSection;
   4477   delete DataSection;
   4478   delete BSSSection;
   4479   delete ReadOnlySection;
   4480 
   4481   delete StaticCtorSection;
   4482   delete StaticDtorSection;
   4483   delete LSDASection;
   4484   delete EHFrameSection;
   4485   delete DwarfAbbrevSection;
   4486   delete DwarfInfoSection;
   4487   delete DwarfLineSection;
   4488   delete DwarfFrameSection;
   4489   delete DwarfPubTypesSection;
   4490   delete DwarfDebugInlineSection;
   4491   delete DwarfStrSection;
   4492   delete DwarfLocSection;
   4493   delete DwarfARangesSection;
   4494   delete DwarfRangesSection;
   4495 }
   4496 
   4497 const MCSection *
   4498 NVPTXTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
   4499                                               SectionKind Kind, Mangler &Mang,
   4500                                               const TargetMachine &TM) const {
   4501   return getDataSection();
   4502 }
   4503