//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <sstream>

#undef DEBUG_TYPE
#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

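// Monotonically increasing ID assigned to each lowered call site, so that the
// generated prototype names (see getPrototype) and CALLSEQ markers (see
// LowerCall) are unique within the module.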
static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned>
FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
                             " 1: do it, 2: do it aggressively)"),
                    cl::init(2));

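/// IsPTXVectorType - Returns true if the given MVT is one of the vector types
/// (v2/v4 of the basic integer and floating-point element types) that PTX can
/// load and store directly as vectors.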
static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    if (VT.isVector())
      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
        ValueVTs.push_back(VT.getVectorElementType());
        if (Offsets)
          Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
      }
    else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {

  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);

  // Operations not directly supported by NVPTX.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  // Some SIGN_EXTEND_INREG operations can be done using the cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  if (STI.hasROT64()) {
    setOperationAction(ISD::ROTL, MVT::i64, Legal);
    setOperationAction(ISD::ROTR, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i64, Expand);
    setOperationAction(ISD::ROTR, MVT::i64, Expand);
  }
  if (STI.hasROT32()) {
    setOperationAction(ISD::ROTL, MVT::i32, Legal);
    setOperationAction(ISD::ROTR, MVT::i32, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i32, Expand);
    setOperationAction(ISD::ROTR, MVT::i32, Expand);
  }

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  setOperationAction(ISD::ADDC, MVT::i64, Expand);
  setOperationAction(ISD::ADDE, MVT::i64, Expand);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::CTLZ, MVT::i16, Legal);
  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  setOperationAction(ISD::CTPOP, MVT::i16, Legal);
  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
  setOperationAction(ISD::CTPOP, MVT::i64, Legal);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SELECT);

  // Now deduce the information based on the above-mentioned actions.
  computeRegisterProperties(STI.getRegisterInfo());
}

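// Map each NVPTXISD opcode to a human-readable name for SelectionDAG debug
// dumps. Only the FIRST_NUMBER sentinel falls through to return nullptr.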
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;
  case NVPTXISD::CALL:
    return "NVPTXISD::CALL";
  case NVPTXISD::RET_FLAG:
    return "NVPTXISD::RET_FLAG";
  case NVPTXISD::LOAD_PARAM:
    return "NVPTXISD::LOAD_PARAM";
  case NVPTXISD::Wrapper:
    return "NVPTXISD::Wrapper";
  case NVPTXISD::DeclareParam:
    return "NVPTXISD::DeclareParam";
  case NVPTXISD::DeclareScalarParam:
    return "NVPTXISD::DeclareScalarParam";
  case NVPTXISD::DeclareRet:
    return "NVPTXISD::DeclareRet";
  case NVPTXISD::DeclareScalarRet:
    return "NVPTXISD::DeclareScalarRet";
  case NVPTXISD::DeclareRetParam:
    return "NVPTXISD::DeclareRetParam";
  case NVPTXISD::PrintCall:
    return "NVPTXISD::PrintCall";
  case NVPTXISD::PrintConvergentCall:
    return "NVPTXISD::PrintConvergentCall";
  case NVPTXISD::PrintCallUni:
    return "NVPTXISD::PrintCallUni";
  case NVPTXISD::PrintConvergentCallUni:
    return "NVPTXISD::PrintConvergentCallUni";
  case NVPTXISD::LoadParam:
    return "NVPTXISD::LoadParam";
  case NVPTXISD::LoadParamV2:
    return "NVPTXISD::LoadParamV2";
  case NVPTXISD::LoadParamV4:
    return "NVPTXISD::LoadParamV4";
  case NVPTXISD::StoreParam:
    return "NVPTXISD::StoreParam";
  case NVPTXISD::StoreParamV2:
    return "NVPTXISD::StoreParamV2";
  case NVPTXISD::StoreParamV4:
    return "NVPTXISD::StoreParamV4";
  case NVPTXISD::StoreParamS32:
    return "NVPTXISD::StoreParamS32";
  case NVPTXISD::StoreParamU32:
    return "NVPTXISD::StoreParamU32";
  case NVPTXISD::CallArgBegin:
    return "NVPTXISD::CallArgBegin";
  case NVPTXISD::CallArg:
    return "NVPTXISD::CallArg";
  case NVPTXISD::LastCallArg:
    return "NVPTXISD::LastCallArg";
  case NVPTXISD::CallArgEnd:
    return "NVPTXISD::CallArgEnd";
  case NVPTXISD::CallVoid:
    return "NVPTXISD::CallVoid";
  case NVPTXISD::CallVal:
    return "NVPTXISD::CallVal";
  case NVPTXISD::CallSymbol:
    return "NVPTXISD::CallSymbol";
  case NVPTXISD::Prototype:
    return "NVPTXISD::Prototype";
  case NVPTXISD::MoveParam:
    return "NVPTXISD::MoveParam";
  case NVPTXISD::StoreRetval:
    return "NVPTXISD::StoreRetval";
  case NVPTXISD::StoreRetvalV2:
    return "NVPTXISD::StoreRetvalV2";
  case NVPTXISD::StoreRetvalV4:
    return "NVPTXISD::StoreRetvalV4";
  case NVPTXISD::PseudoUseParam:
    return "NVPTXISD::PseudoUseParam";
  case NVPTXISD::RETURN:
    return "NVPTXISD::RETURN";
  case NVPTXISD::CallSeqBegin:
    return "NVPTXISD::CallSeqBegin";
  case NVPTXISD::CallSeqEnd:
    return "NVPTXISD::CallSeqEnd";
  case NVPTXISD::CallPrototype:
    return "NVPTXISD::CallPrototype";
  case NVPTXISD::LoadV2:
    return "NVPTXISD::LoadV2";
  case NVPTXISD::LoadV4:
    return "NVPTXISD::LoadV4";
  case NVPTXISD::LDGV2:
    return "NVPTXISD::LDGV2";
  case NVPTXISD::LDGV4:
    return "NVPTXISD::LDGV4";
  case NVPTXISD::LDUV2:
    return "NVPTXISD::LDUV2";
  case NVPTXISD::LDUV4:
    return "NVPTXISD::LDUV4";
  case NVPTXISD::StoreV2:
    return "NVPTXISD::StoreV2";
  case NVPTXISD::StoreV4:
    return "NVPTXISD::StoreV4";
  case NVPTXISD::FUN_SHFL_CLAMP:
    return "NVPTXISD::FUN_SHFL_CLAMP";
  case NVPTXISD::FUN_SHFR_CLAMP:
    return "NVPTXISD::FUN_SHFR_CLAMP";
  case NVPTXISD::IMAD:
    return "NVPTXISD::IMAD";
  case NVPTXISD::Dummy:
    return "NVPTXISD::Dummy";
  case NVPTXISD::MUL_WIDE_SIGNED:
    return "NVPTXISD::MUL_WIDE_SIGNED";
  case NVPTXISD::MUL_WIDE_UNSIGNED:
    return "NVPTXISD::MUL_WIDE_UNSIGNED";
  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
  case NVPTXISD::Tex1DFloatFloatLevel:
    return "NVPTXISD::Tex1DFloatFloatLevel";
  case NVPTXISD::Tex1DFloatFloatGrad:
    return "NVPTXISD::Tex1DFloatFloatGrad";
  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
  case NVPTXISD::Tex1DS32FloatLevel:
    return "NVPTXISD::Tex1DS32FloatLevel";
  case NVPTXISD::Tex1DS32FloatGrad:
    return "NVPTXISD::Tex1DS32FloatGrad";
  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
  case NVPTXISD::Tex1DU32FloatLevel:
    return "NVPTXISD::Tex1DU32FloatLevel";
  case NVPTXISD::Tex1DU32FloatGrad:
    return "NVPTXISD::Tex1DU32FloatGrad";
  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    return "NVPTXISD::Tex1DArrayS32FloatLevel";
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    return "NVPTXISD::Tex1DArrayS32FloatGrad";
  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    return "NVPTXISD::Tex1DArrayU32FloatLevel";
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    return "NVPTXISD::Tex1DArrayU32FloatGrad";
  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
  case NVPTXISD::Tex2DFloatFloatLevel:
    return "NVPTXISD::Tex2DFloatFloatLevel";
  case NVPTXISD::Tex2DFloatFloatGrad:
    return "NVPTXISD::Tex2DFloatFloatGrad";
  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
  case NVPTXISD::Tex2DS32FloatLevel:
    return "NVPTXISD::Tex2DS32FloatLevel";
  case NVPTXISD::Tex2DS32FloatGrad:
    return "NVPTXISD::Tex2DS32FloatGrad";
  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
  case NVPTXISD::Tex2DU32FloatLevel:
    return "NVPTXISD::Tex2DU32FloatLevel";
  case NVPTXISD::Tex2DU32FloatGrad:
    return "NVPTXISD::Tex2DU32FloatGrad";
  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    return "NVPTXISD::Tex2DArrayS32FloatLevel";
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    return "NVPTXISD::Tex2DArrayS32FloatGrad";
  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    return "NVPTXISD::Tex2DArrayU32FloatLevel";
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    return "NVPTXISD::Tex2DArrayU32FloatGrad";
  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
  case NVPTXISD::Tex3DFloatFloatLevel:
    return "NVPTXISD::Tex3DFloatFloatLevel";
  case NVPTXISD::Tex3DFloatFloatGrad:
    return "NVPTXISD::Tex3DFloatFloatGrad";
  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
  case NVPTXISD::Tex3DS32FloatLevel:
    return "NVPTXISD::Tex3DS32FloatLevel";
  case NVPTXISD::Tex3DS32FloatGrad:
    return "NVPTXISD::Tex3DS32FloatGrad";
  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
  case NVPTXISD::Tex3DU32FloatLevel:
    return "NVPTXISD::Tex3DU32FloatLevel";
  case NVPTXISD::Tex3DU32FloatGrad:
    return "NVPTXISD::Tex3DU32FloatGrad";
  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
  case NVPTXISD::TexCubeFloatFloatLevel:
    return "NVPTXISD::TexCubeFloatFloatLevel";
  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
  case NVPTXISD::TexCubeS32FloatLevel:
    return "NVPTXISD::TexCubeS32FloatLevel";
  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
  case NVPTXISD::TexCubeU32FloatLevel:
    return "NVPTXISD::TexCubeU32FloatLevel";
  case NVPTXISD::TexCubeArrayFloatFloat:
    return "NVPTXISD::TexCubeArrayFloatFloat";
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  case NVPTXISD::TexCubeArrayS32Float:
    return "NVPTXISD::TexCubeArrayS32Float";
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    return "NVPTXISD::TexCubeArrayS32FloatLevel";
  case NVPTXISD::TexCubeArrayU32Float:
    return "NVPTXISD::TexCubeArrayU32Float";
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    return "NVPTXISD::TexCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4R2DFloatFloat:
    return "NVPTXISD::Tld4R2DFloatFloat";
  case NVPTXISD::Tld4G2DFloatFloat:
    return "NVPTXISD::Tld4G2DFloatFloat";
  case NVPTXISD::Tld4B2DFloatFloat:
    return "NVPTXISD::Tld4B2DFloatFloat";
  case NVPTXISD::Tld4A2DFloatFloat:
    return "NVPTXISD::Tld4A2DFloatFloat";
  case NVPTXISD::Tld4R2DS64Float:
    return "NVPTXISD::Tld4R2DS64Float";
  case NVPTXISD::Tld4G2DS64Float:
    return "NVPTXISD::Tld4G2DS64Float";
  case NVPTXISD::Tld4B2DS64Float:
    return "NVPTXISD::Tld4B2DS64Float";
  case NVPTXISD::Tld4A2DS64Float:
    return "NVPTXISD::Tld4A2DS64Float";
  case NVPTXISD::Tld4R2DU64Float:
    return "NVPTXISD::Tld4R2DU64Float";
  case NVPTXISD::Tld4G2DU64Float:
    return "NVPTXISD::Tld4G2DU64Float";
  case NVPTXISD::Tld4B2DU64Float:
    return "NVPTXISD::Tld4B2DU64Float";
  case NVPTXISD::Tld4A2DU64Float:
    return "NVPTXISD::Tld4A2DU64Float";

  case NVPTXISD::TexUnified1DFloatS32:
    return "NVPTXISD::TexUnified1DFloatS32";
  case NVPTXISD::TexUnified1DFloatFloat:
    return "NVPTXISD::TexUnified1DFloatFloat";
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    return "NVPTXISD::TexUnified1DFloatFloatLevel";
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    return "NVPTXISD::TexUnified1DFloatFloatGrad";
  case NVPTXISD::TexUnified1DS32S32:
    return "NVPTXISD::TexUnified1DS32S32";
  case NVPTXISD::TexUnified1DS32Float:
    return "NVPTXISD::TexUnified1DS32Float";
  case NVPTXISD::TexUnified1DS32FloatLevel:
    return "NVPTXISD::TexUnified1DS32FloatLevel";
  case NVPTXISD::TexUnified1DS32FloatGrad:
    return "NVPTXISD::TexUnified1DS32FloatGrad";
  case NVPTXISD::TexUnified1DU32S32:
    return "NVPTXISD::TexUnified1DU32S32";
  case NVPTXISD::TexUnified1DU32Float:
    return "NVPTXISD::TexUnified1DU32Float";
  case NVPTXISD::TexUnified1DU32FloatLevel:
    return "NVPTXISD::TexUnified1DU32FloatLevel";
  case NVPTXISD::TexUnified1DU32FloatGrad:
    return "NVPTXISD::TexUnified1DU32FloatGrad";
  case NVPTXISD::TexUnified1DArrayFloatS32:
    return "NVPTXISD::TexUnified1DArrayFloatS32";
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    return "NVPTXISD::TexUnified1DArrayFloatFloat";
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified1DArrayS32S32:
    return "NVPTXISD::TexUnified1DArrayS32S32";
  case NVPTXISD::TexUnified1DArrayS32Float:
    return "NVPTXISD::TexUnified1DArrayS32Float";
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  case NVPTXISD::TexUnified1DArrayU32S32:
    return "NVPTXISD::TexUnified1DArrayU32S32";
  case NVPTXISD::TexUnified1DArrayU32Float:
    return "NVPTXISD::TexUnified1DArrayU32Float";
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  case NVPTXISD::TexUnified2DFloatS32:
    return "NVPTXISD::TexUnified2DFloatS32";
  case NVPTXISD::TexUnified2DFloatFloat:
    return "NVPTXISD::TexUnified2DFloatFloat";
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    return "NVPTXISD::TexUnified2DFloatFloatLevel";
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    return "NVPTXISD::TexUnified2DFloatFloatGrad";
  case NVPTXISD::TexUnified2DS32S32:
    return "NVPTXISD::TexUnified2DS32S32";
  case NVPTXISD::TexUnified2DS32Float:
    return "NVPTXISD::TexUnified2DS32Float";
  case NVPTXISD::TexUnified2DS32FloatLevel:
    return "NVPTXISD::TexUnified2DS32FloatLevel";
  case NVPTXISD::TexUnified2DS32FloatGrad:
    return "NVPTXISD::TexUnified2DS32FloatGrad";
  case NVPTXISD::TexUnified2DU32S32:
    return "NVPTXISD::TexUnified2DU32S32";
  case NVPTXISD::TexUnified2DU32Float:
    return "NVPTXISD::TexUnified2DU32Float";
  case NVPTXISD::TexUnified2DU32FloatLevel:
    return "NVPTXISD::TexUnified2DU32FloatLevel";
  case NVPTXISD::TexUnified2DU32FloatGrad:
    return "NVPTXISD::TexUnified2DU32FloatGrad";
  case NVPTXISD::TexUnified2DArrayFloatS32:
    return "NVPTXISD::TexUnified2DArrayFloatS32";
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    return "NVPTXISD::TexUnified2DArrayFloatFloat";
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified2DArrayS32S32:
    return "NVPTXISD::TexUnified2DArrayS32S32";
  case NVPTXISD::TexUnified2DArrayS32Float:
    return "NVPTXISD::TexUnified2DArrayS32Float";
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  case NVPTXISD::TexUnified2DArrayU32S32:
    return "NVPTXISD::TexUnified2DArrayU32S32";
  case NVPTXISD::TexUnified2DArrayU32Float:
    return "NVPTXISD::TexUnified2DArrayU32Float";
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  case NVPTXISD::TexUnified3DFloatS32:
    return "NVPTXISD::TexUnified3DFloatS32";
  case NVPTXISD::TexUnified3DFloatFloat:
    return "NVPTXISD::TexUnified3DFloatFloat";
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    return "NVPTXISD::TexUnified3DFloatFloatLevel";
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    return "NVPTXISD::TexUnified3DFloatFloatGrad";
  case NVPTXISD::TexUnified3DS32S32:
    return "NVPTXISD::TexUnified3DS32S32";
  case NVPTXISD::TexUnified3DS32Float:
    return "NVPTXISD::TexUnified3DS32Float";
  case NVPTXISD::TexUnified3DS32FloatLevel:
    return "NVPTXISD::TexUnified3DS32FloatLevel";
  case NVPTXISD::TexUnified3DS32FloatGrad:
    return "NVPTXISD::TexUnified3DS32FloatGrad";
  case NVPTXISD::TexUnified3DU32S32:
    return "NVPTXISD::TexUnified3DU32S32";
  case NVPTXISD::TexUnified3DU32Float:
    return "NVPTXISD::TexUnified3DU32Float";
  case NVPTXISD::TexUnified3DU32FloatLevel:
    return "NVPTXISD::TexUnified3DU32FloatLevel";
  case NVPTXISD::TexUnified3DU32FloatGrad:
    return "NVPTXISD::TexUnified3DU32FloatGrad";
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    return "NVPTXISD::TexUnifiedCubeFloatFloat";
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeS32Float:
    return "NVPTXISD::TexUnifiedCubeS32Float";
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeU32Float:
    return "NVPTXISD::TexUnifiedCubeU32Float";
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    return "NVPTXISD::Tld4UnifiedR2DS64Float";
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    return "NVPTXISD::Tld4UnifiedG2DS64Float";
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    return "NVPTXISD::Tld4UnifiedB2DS64Float";
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    return "NVPTXISD::Tld4UnifiedA2DS64Float";
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    return "NVPTXISD::Tld4UnifiedR2DU64Float";
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    return "NVPTXISD::Tld4UnifiedG2DU64Float";
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    return "NVPTXISD::Tld4UnifiedB2DU64Float";
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    return "NVPTXISD::Tld4UnifiedA2DU64Float";

  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";

  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";

  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";

  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";

  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";

  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";

  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";

  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";

  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";

  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";

  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";

  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";

  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";

  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";

  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
  }
  return nullptr;
}

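// Split vectors of i1 into scalar elements: PTX predicate (.pred) registers
// have no vector form, so per-element operations are the natural lowering.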
TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

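// Lower a global address by wrapping the target global-address node in an
// NVPTXISD::Wrapper, which is later emitted as a reference to the PTX symbol.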
SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

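// Build the PTX ".callprototype" declaration string describing the return
// value and parameter layout of a callee. PTX requires such a prototype to be
// in scope when an indirect call is made.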
std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
    const ImmutableCallSite *CS) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::stringstream O;
  O << "prototype_" << uniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
        if (size < 32)
          size = 32;
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if ((retTy->getTypeID() == Type::StructTyID) ||
               isa<VectorType>(retTy)) {
      auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
      O << ".param .align " << retAlignment << " .b8 _["
        << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned OIdx = 0;
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (Ty->isAggregateType() || Ty->isVectorTy()) {
        unsigned align = 0;
        const CallInst *CallI = cast<CallInst>(CS->getInstruction());
        // +1 because index 0 is reserved for return type alignment
        if (!llvm::getAlign(*CallI, i + 1, align))
          align = DL.getABITypeAlignment(Ty);
        unsigned sz = DL.getTypeAllocSize(Ty);
        O << ".param .align " << align << " .b8 ";
        O << "_";
        O << "[" << sz << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      // scalar type
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        if (sz < 32)
          sz = 32;
      } else if (isa<PointerType>(Ty))
        sz = PtrVT.getSizeInBits();
      else
        sz = Ty->getPrimitiveSizeInBits();
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }
    auto *PTy = dyn_cast<PointerType>(Ty);
    assert(PTy && "Param with byval attribute should be a pointer type");
    Type *ETy = PTy->getElementType();

    unsigned align = Outs[OIdx].Flags.getByValAlign();
    unsigned sz = DL.getTypeAllocSize(ETy);
    O << ".param .align " << align << " .b8 ";
    O << "_";
    O << "[" << sz << "]";
  }
  O << ");";
  return O.str();
}

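// Determine the alignment to use for the argument at position Idx of a call.
// Prefers explicit alignment metadata on the call or on the callee (looking
// through constant casts to find a direct callee), and falls back to the ABI
// type alignment otherwise.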
unsigned
NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
                                          const ImmutableCallSite *CS,
                                          Type *Ty,
                                          unsigned Idx) const {
  unsigned Align = 0;
  const Value *DirectCallee = CS->getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.
    const Instruction *CalleeI = CS->getInstruction();
    assert(CalleeI && "Call target is not a function or derived value?");

    // With bitcast'd call targets, the instruction will be the call
    if (isa<CallInst>(CalleeI)) {
      // Check if we have call alignment metadata
      if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
        return Align;

      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
      // Ignore any bitcast instructions
      while (isa<ConstantExpr>(CalleeV)) {
        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
        if (!CE->isCast())
          break;
        // Look through the bitcast
        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
      }

      // We have now looked past all of the bitcasts.  Do we finally have a
      // Function?
      if (isa<Function>(CalleeV))
        DirectCallee = CalleeV;
    }
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
      return Align;

  // Call is indirect or alignment information is not available, fall back to
  // the ABI type alignment
  auto &DL = CS->getCaller()->getParent()->getDataLayout();
  return DL.getABITypeAlignment(Ty);
}

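// Lower an outgoing call: declare .param space for each argument, copy the
// argument values into it, and emit the call sequence itself (including a
// .callprototype declaration when the callee is indirect).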
   1054 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   1055                                        SmallVectorImpl<SDValue> &InVals) const {
   1056   SelectionDAG &DAG = CLI.DAG;
   1057   SDLoc dl = CLI.DL;
   1058   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   1059   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
   1060   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
   1061   SDValue Chain = CLI.Chain;
   1062   SDValue Callee = CLI.Callee;
   1063   bool &isTailCall = CLI.IsTailCall;
   1064   ArgListTy &Args = CLI.getArgs();
   1065   Type *retTy = CLI.RetTy;
   1066   ImmutableCallSite *CS = CLI.CS;
   1067 
   1068   bool isABI = (STI.getSmVersion() >= 20);
   1069   assert(isABI && "Non-ABI compilation is not supported");
   1070   if (!isABI)
   1071     return Chain;
   1072   MachineFunction &MF = DAG.getMachineFunction();
   1073   const Function *F = MF.getFunction();
   1074   auto &DL = MF.getDataLayout();
   1075 
   1076   SDValue tempChain = Chain;
   1077   Chain = DAG.getCALLSEQ_START(Chain,
   1078                                DAG.getIntPtrConstant(uniqueCallSite, dl, true),
   1079                                dl);
   1080   SDValue InFlag = Chain.getValue(1);
   1081 
   1082   unsigned paramCount = 0;
   1083   // Args.size() and Outs.size() need not match.
   1084   // Outs.size() will be larger
   1085   //   * if there is an aggregate argument with multiple fields (each field
   1086   //     showing up separately in Outs)
   1087   //   * if there is a vector argument with more than typical vector-length
   1088   //     elements (generally if more than 4) where each vector element is
   1089   //     individually present in Outs.
   1090   // So a different index should be used for indexing into Outs/OutVals.
   1091   // See similar issue in LowerFormalArguments.
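  // For example, a single `struct { i32, i32 }` argument occupies one entry
  // in Args but two consecutive entries in Outs/OutVals, one per field, so
  // OIdx below advances independently of i.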
   1092   unsigned OIdx = 0;
  // Declare the .params or .reg needed to pass values
  // to the function.
   1095   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
   1096     EVT VT = Outs[OIdx].VT;
   1097     Type *Ty = Args[i].Ty;
   1098 
   1099     if (!Outs[OIdx].Flags.isByVal()) {
   1100       if (Ty->isAggregateType()) {
   1101         // aggregate
   1102         SmallVector<EVT, 16> vtparts;
   1103         SmallVector<uint64_t, 16> Offsets;
   1104         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
   1105                            0);
   1106 
   1107         unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
   1108         // declare .param .align <align> .b8 .param<n>[<size>];
   1109         unsigned sz = DL.getTypeAllocSize(Ty);
   1110         SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1111         SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
   1112                                                              MVT::i32),
   1113                                       DAG.getConstant(paramCount, dl, MVT::i32),
   1114                                       DAG.getConstant(sz, dl, MVT::i32),
   1115                                       InFlag };
   1116         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
   1117                             DeclareParamOps);
   1118         InFlag = Chain.getValue(1);
   1119         for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
   1120           EVT elemtype = vtparts[j];
   1121           unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
   1122           if (elemtype.isInteger() && (sz < 8))
   1123             sz = 8;
   1124           SDValue StVal = OutVals[OIdx];
   1125           if (elemtype.getSizeInBits() < 16) {
   1126             StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
   1127           }
   1128           SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1129           SDValue CopyParamOps[] = { Chain,
   1130                                      DAG.getConstant(paramCount, dl, MVT::i32),
   1131                                      DAG.getConstant(Offsets[j], dl, MVT::i32),
   1132                                      StVal, InFlag };
   1133           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
   1134                                           CopyParamVTs, CopyParamOps,
   1135                                           elemtype, MachinePointerInfo(),
   1136                                           ArgAlign);
   1137           InFlag = Chain.getValue(1);
   1138           ++OIdx;
   1139         }
   1140         if (vtparts.size() > 0)
   1141           --OIdx;
   1142         ++paramCount;
   1143         continue;
   1144       }
   1145       if (Ty->isVectorTy()) {
   1146         EVT ObjectVT = getValueType(DL, Ty);
   1147         unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
   1148         // declare .param .align <align> .b8 .param<n>[<size>];
   1149         unsigned sz = DL.getTypeAllocSize(Ty);
   1150         SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1151         SDValue DeclareParamOps[] = { Chain,
   1152                                       DAG.getConstant(align, dl, MVT::i32),
   1153                                       DAG.getConstant(paramCount, dl, MVT::i32),
   1154                                       DAG.getConstant(sz, dl, MVT::i32),
   1155                                       InFlag };
   1156         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
   1157                             DeclareParamOps);
   1158         InFlag = Chain.getValue(1);
   1159         unsigned NumElts = ObjectVT.getVectorNumElements();
   1160         EVT EltVT = ObjectVT.getVectorElementType();
   1161         EVT MemVT = EltVT;
   1162         bool NeedExtend = false;
   1163         if (EltVT.getSizeInBits() < 16) {
   1164           NeedExtend = true;
   1165           EltVT = MVT::i16;
   1166         }
   1167 
   1168         // V1 store
   1169         if (NumElts == 1) {
   1170           SDValue Elt = OutVals[OIdx++];
   1171           if (NeedExtend)
   1172             Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
   1173 
   1174           SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1175           SDValue CopyParamOps[] = { Chain,
   1176                                      DAG.getConstant(paramCount, dl, MVT::i32),
   1177                                      DAG.getConstant(0, dl, MVT::i32), Elt,
   1178                                      InFlag };
   1179           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
   1180                                           CopyParamVTs, CopyParamOps,
   1181                                           MemVT, MachinePointerInfo());
   1182           InFlag = Chain.getValue(1);
   1183         } else if (NumElts == 2) {
   1184           SDValue Elt0 = OutVals[OIdx++];
   1185           SDValue Elt1 = OutVals[OIdx++];
   1186           if (NeedExtend) {
   1187             Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
   1188             Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
   1189           }
   1190 
   1191           SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1192           SDValue CopyParamOps[] = { Chain,
   1193                                      DAG.getConstant(paramCount, dl, MVT::i32),
   1194                                      DAG.getConstant(0, dl, MVT::i32), Elt0,
   1195                                      Elt1, InFlag };
   1196           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
   1197                                           CopyParamVTs, CopyParamOps,
   1198                                           MemVT, MachinePointerInfo());
   1199           InFlag = Chain.getValue(1);
   1200         } else {
   1201           unsigned curOffset = 0;
   1202           // V4 stores
          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
          // the vector will be expanded to a power of 2 elements, so we know
          // we can always round up to the next multiple of 4 when creating
          // the vector stores.
   1208           // e.g.  4 elem => 1 st.v4
   1209           //       6 elem => 2 st.v4
   1210           //       8 elem => 2 st.v4
   1211           //      11 elem => 3 st.v4
   1212           unsigned VecSize = 4;
   1213           if (EltVT.getSizeInBits() == 64)
   1214             VecSize = 2;
   1215 
   1216           // This is potentially only part of a vector, so assume all elements
   1217           // are packed together.
   1218           unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
   1219 
   1220           for (unsigned i = 0; i < NumElts; i += VecSize) {
   1221             // Get values
   1222             SDValue StoreVal;
   1223             SmallVector<SDValue, 8> Ops;
   1224             Ops.push_back(Chain);
   1225             Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
   1226             Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
   1227 
   1228             unsigned Opc = NVPTXISD::StoreParamV2;
   1229 
   1230             StoreVal = OutVals[OIdx++];
   1231             if (NeedExtend)
   1232               StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
   1233             Ops.push_back(StoreVal);
   1234 
   1235             if (i + 1 < NumElts) {
   1236               StoreVal = OutVals[OIdx++];
   1237               if (NeedExtend)
   1238                 StoreVal =
   1239                     DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
   1240             } else {
   1241               StoreVal = DAG.getUNDEF(EltVT);
   1242             }
   1243             Ops.push_back(StoreVal);
   1244 
   1245             if (VecSize == 4) {
   1246               Opc = NVPTXISD::StoreParamV4;
   1247               if (i + 2 < NumElts) {
   1248                 StoreVal = OutVals[OIdx++];
   1249                 if (NeedExtend)
   1250                   StoreVal =
   1251                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
   1252               } else {
   1253                 StoreVal = DAG.getUNDEF(EltVT);
   1254               }
   1255               Ops.push_back(StoreVal);
   1256 
   1257               if (i + 3 < NumElts) {
   1258                 StoreVal = OutVals[OIdx++];
   1259                 if (NeedExtend)
   1260                   StoreVal =
   1261                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
   1262               } else {
   1263                 StoreVal = DAG.getUNDEF(EltVT);
   1264               }
   1265               Ops.push_back(StoreVal);
   1266             }
   1267 
   1268             Ops.push_back(InFlag);
   1269 
   1270             SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1271             Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
   1272                                             MemVT, MachinePointerInfo());
   1273             InFlag = Chain.getValue(1);
   1274             curOffset += PerStoreOffset;
   1275           }
   1276         }
   1277         ++paramCount;
   1278         --OIdx;
   1279         continue;
   1280       }
   1281       // Plain scalar
      // for ABI, declare .param .b<size> .param<n>;
   1283       unsigned sz = VT.getSizeInBits();
   1284       bool needExtend = false;
   1285       if (VT.isInteger()) {
   1286         if (sz < 16)
   1287           needExtend = true;
   1288         if (sz < 32)
   1289           sz = 32;
   1290       }
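      // For example, an i8 argument is declared as ".param .b32 .param<n>;"
      // (sz was rounded up to 32 above) and its value is extended to i16
      // before the StoreParam below.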
   1291       SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1292       SDValue DeclareParamOps[] = { Chain,
   1293                                     DAG.getConstant(paramCount, dl, MVT::i32),
   1294                                     DAG.getConstant(sz, dl, MVT::i32),
   1295                                     DAG.getConstant(0, dl, MVT::i32), InFlag };
   1296       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
   1297                           DeclareParamOps);
   1298       InFlag = Chain.getValue(1);
   1299       SDValue OutV = OutVals[OIdx];
   1300       if (needExtend) {
        // zext/sext i1/i8 to i16
   1302         unsigned opc = ISD::ZERO_EXTEND;
   1303         if (Outs[OIdx].Flags.isSExt())
   1304           opc = ISD::SIGN_EXTEND;
   1305         OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
   1306       }
   1307       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1308       SDValue CopyParamOps[] = { Chain,
   1309                                  DAG.getConstant(paramCount, dl, MVT::i32),
   1310                                  DAG.getConstant(0, dl, MVT::i32), OutV,
   1311                                  InFlag };
   1312 
   1313       unsigned opcode = NVPTXISD::StoreParam;
   1314       if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
   1315         opcode = NVPTXISD::StoreParamU32;
   1316       else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
   1317         opcode = NVPTXISD::StoreParamS32;
   1318       Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
   1319                                       VT, MachinePointerInfo());
   1320 
   1321       InFlag = Chain.getValue(1);
   1322       ++paramCount;
   1323       continue;
   1324     }
   1325     // struct or vector
   1326     SmallVector<EVT, 16> vtparts;
   1327     SmallVector<uint64_t, 16> Offsets;
   1328     auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
   1329     assert(PTy && "Type of a byval parameter should be pointer");
   1330     ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
   1331                        vtparts, &Offsets, 0);
   1332 
   1333     // declare .param .align <align> .b8 .param<n>[<size>];
   1334     unsigned sz = Outs[OIdx].Flags.getByValSize();
   1335     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1336     unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
   1338     // so we don't need to worry about natural alignment or not.
   1339     // See TargetLowering::LowerCallTo().
   1340     SDValue DeclareParamOps[] = {
   1341       Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), dl, MVT::i32),
   1342       DAG.getConstant(paramCount, dl, MVT::i32),
   1343       DAG.getConstant(sz, dl, MVT::i32), InFlag
   1344     };
   1345     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
   1346                         DeclareParamOps);
   1347     InFlag = Chain.getValue(1);
   1348     for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
   1349       EVT elemtype = vtparts[j];
   1350       int curOffset = Offsets[j];
   1351       unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
   1352       auto PtrVT = getPointerTy(DAG.getDataLayout());
   1353       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
   1354                                     DAG.getConstant(curOffset, dl, PtrVT));
   1355       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
   1356                                    MachinePointerInfo(), false, false, false,
   1357                                    PartAlign);
   1358       if (elemtype.getSizeInBits() < 16) {
   1359         theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
   1360       }
   1361       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1362       SDValue CopyParamOps[] = { Chain,
   1363                                  DAG.getConstant(paramCount, dl, MVT::i32),
   1364                                  DAG.getConstant(curOffset, dl, MVT::i32),
   1365                                  theVal, InFlag };
   1366       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
   1367                                       CopyParamOps, elemtype,
   1368                                       MachinePointerInfo());
   1369 
   1370       InFlag = Chain.getValue(1);
   1371     }
   1372     ++paramCount;
   1373   }
   1374 
   1375   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
   1376   unsigned retAlignment = 0;
   1377 
   1378   // Handle Result
   1379   if (Ins.size() > 0) {
   1380     SmallVector<EVT, 16> resvtparts;
   1381     ComputeValueVTs(*this, DL, retTy, resvtparts);
   1382 
   1383     // Declare
   1384     //  .param .align 16 .b8 retval0[<size-in-bytes>], or
   1385     //  .param .b<size-in-bits> retval0
   1386     unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
   1387     // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
   1388     // these three types to match the logic in
   1389     // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
   1390     // Plus, this behavior is consistent with nvcc's.
   1391     if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
   1392         retTy->isPointerTy()) {
      // Scalars need to be at least 32 bits wide
   1394       if (resultsz < 32)
   1395         resultsz = 32;
   1396       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1397       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
   1398                                   DAG.getConstant(resultsz, dl, MVT::i32),
   1399                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
   1400       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
   1401                           DeclareRetOps);
   1402       InFlag = Chain.getValue(1);
   1403     } else {
   1404       retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
   1405       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1406       SDValue DeclareRetOps[] = { Chain,
   1407                                   DAG.getConstant(retAlignment, dl, MVT::i32),
   1408                                   DAG.getConstant(resultsz / 8, dl, MVT::i32),
   1409                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
   1410       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
   1411                           DeclareRetOps);
   1412       InFlag = Chain.getValue(1);
   1413     }
   1414   }
   1415 
   1416   if (!Func) {
    // This is the indirect function call case: PTX requires a prototype of
    // the form
    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the call
    // instruction.
    // The prototype is embedded in a string and used as the operand of a
    // CallPrototype SDNode, which prints out as the value of the string.
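    // For instance, an indirect call to a callee of type i32 (i32, float)
    // would get a prototype string of roughly the form
    //   proto_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);
    // (illustrative; both 32-bit scalar params are declared as .b32).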
   1424     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1425     std::string Proto =
   1426         getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
   1427     const char *ProtoStr =
   1428       nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
   1429     SDValue ProtoOps[] = {
   1430       Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
   1431     };
   1432     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
   1433     InFlag = Chain.getValue(1);
   1434   }
   1435   // Op to just print "call"
   1436   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1437   SDValue PrintCallOps[] = {
   1438     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
   1439   };
   1440   // We model convergent calls as separate opcodes.
   1441   unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
   1442   if (CLI.IsConvergent)
   1443     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
   1444                                               : NVPTXISD::PrintConvergentCall;
   1445   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
   1446   InFlag = Chain.getValue(1);
   1447 
   1448   // Ops to print out the function name
   1449   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1450   SDValue CallVoidOps[] = { Chain, Callee, InFlag };
   1451   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
   1452   InFlag = Chain.getValue(1);
   1453 
   1454   // Ops to print out the param list
   1455   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1456   SDValue CallArgBeginOps[] = { Chain, InFlag };
   1457   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
   1458                       CallArgBeginOps);
   1459   InFlag = Chain.getValue(1);
   1460 
   1461   for (unsigned i = 0, e = paramCount; i != e; ++i) {
   1462     unsigned opcode;
   1463     if (i == (e - 1))
   1464       opcode = NVPTXISD::LastCallArg;
   1465     else
   1466       opcode = NVPTXISD::CallArg;
   1467     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1468     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
   1469                              DAG.getConstant(i, dl, MVT::i32), InFlag };
   1470     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
   1471     InFlag = Chain.getValue(1);
   1472   }
   1473   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1474   SDValue CallArgEndOps[] = { Chain,
   1475                               DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
   1476                               InFlag };
   1477   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
   1478   InFlag = Chain.getValue(1);
   1479 
   1480   if (!Func) {
   1481     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   1482     SDValue PrototypeOps[] = { Chain,
   1483                                DAG.getConstant(uniqueCallSite, dl, MVT::i32),
   1484                                InFlag };
   1485     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
   1486     InFlag = Chain.getValue(1);
   1487   }
   1488 
   1489   // Generate loads from param memory/moves from registers for result
   1490   if (Ins.size() > 0) {
   1491     if (retTy && retTy->isVectorTy()) {
   1492       EVT ObjectVT = getValueType(DL, retTy);
   1493       unsigned NumElts = ObjectVT.getVectorNumElements();
   1494       EVT EltVT = ObjectVT.getVectorElementType();
   1495       assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
   1496                                                       ObjectVT) == NumElts &&
   1497              "Vector was not scalarized");
   1498       unsigned sz = EltVT.getSizeInBits();
   1499       bool needTruncate = sz < 8;
   1500 
   1501       if (NumElts == 1) {
   1502         // Just a simple load
   1503         SmallVector<EVT, 4> LoadRetVTs;
   1504         if (EltVT == MVT::i1 || EltVT == MVT::i8) {
   1505           // If loading i1/i8 result, generate
   1506           //   load.b8 i16
   1507           //   if i1
   1508           //   trunc i16 to i1
   1509           LoadRetVTs.push_back(MVT::i16);
   1510         } else
   1511           LoadRetVTs.push_back(EltVT);
   1512         LoadRetVTs.push_back(MVT::Other);
   1513         LoadRetVTs.push_back(MVT::Glue);
   1514         SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
   1515                                 DAG.getConstant(0, dl, MVT::i32), InFlag};
   1516         SDValue retval = DAG.getMemIntrinsicNode(
   1517             NVPTXISD::LoadParam, dl,
   1518             DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
   1519         Chain = retval.getValue(1);
   1520         InFlag = retval.getValue(2);
   1521         SDValue Ret0 = retval;
   1522         if (needTruncate)
   1523           Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
   1524         InVals.push_back(Ret0);
   1525       } else if (NumElts == 2) {
   1526         // LoadV2
   1527         SmallVector<EVT, 4> LoadRetVTs;
   1528         if (EltVT == MVT::i1 || EltVT == MVT::i8) {
   1529           // If loading i1/i8 result, generate
   1530           //   load.b8 i16
   1531           //   if i1
   1532           //   trunc i16 to i1
   1533           LoadRetVTs.push_back(MVT::i16);
   1534           LoadRetVTs.push_back(MVT::i16);
   1535         } else {
   1536           LoadRetVTs.push_back(EltVT);
   1537           LoadRetVTs.push_back(EltVT);
   1538         }
   1539         LoadRetVTs.push_back(MVT::Other);
   1540         LoadRetVTs.push_back(MVT::Glue);
   1541         SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
   1542                                 DAG.getConstant(0, dl, MVT::i32), InFlag};
   1543         SDValue retval = DAG.getMemIntrinsicNode(
   1544             NVPTXISD::LoadParamV2, dl,
   1545             DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
   1546         Chain = retval.getValue(2);
   1547         InFlag = retval.getValue(3);
   1548         SDValue Ret0 = retval.getValue(0);
   1549         SDValue Ret1 = retval.getValue(1);
   1550         if (needTruncate) {
   1551           Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
   1552           InVals.push_back(Ret0);
   1553           Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
   1554           InVals.push_back(Ret1);
   1555         } else {
   1556           InVals.push_back(Ret0);
   1557           InVals.push_back(Ret1);
   1558         }
   1559       } else {
   1560         // Split into N LoadV4
   1561         unsigned Ofst = 0;
   1562         unsigned VecSize = 4;
   1563         unsigned Opc = NVPTXISD::LoadParamV4;
   1564         if (EltVT.getSizeInBits() == 64) {
   1565           VecSize = 2;
   1566           Opc = NVPTXISD::LoadParamV2;
   1567         }
   1568         EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
   1569         for (unsigned i = 0; i < NumElts; i += VecSize) {
   1570           SmallVector<EVT, 8> LoadRetVTs;
   1571           if (EltVT == MVT::i1 || EltVT == MVT::i8) {
   1572             // If loading i1/i8 result, generate
   1573             //   load.b8 i16
   1574             //   if i1
   1575             //   trunc i16 to i1
   1576             for (unsigned j = 0; j < VecSize; ++j)
   1577               LoadRetVTs.push_back(MVT::i16);
   1578           } else {
   1579             for (unsigned j = 0; j < VecSize; ++j)
   1580               LoadRetVTs.push_back(EltVT);
   1581           }
   1582           LoadRetVTs.push_back(MVT::Other);
   1583           LoadRetVTs.push_back(MVT::Glue);
   1584           SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
   1585                                   DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
   1586           SDValue retval = DAG.getMemIntrinsicNode(
   1587               Opc, dl, DAG.getVTList(LoadRetVTs),
   1588               LoadRetOps, EltVT, MachinePointerInfo());
   1589           if (VecSize == 2) {
   1590             Chain = retval.getValue(2);
   1591             InFlag = retval.getValue(3);
   1592           } else {
   1593             Chain = retval.getValue(4);
   1594             InFlag = retval.getValue(5);
   1595           }
   1596 
   1597           for (unsigned j = 0; j < VecSize; ++j) {
   1598             if (i + j >= NumElts)
   1599               break;
   1600             SDValue Elt = retval.getValue(j);
   1601             if (needTruncate)
   1602               Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
   1603             InVals.push_back(Elt);
   1604           }
   1605           Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
   1606         }
   1607       }
   1608     } else {
   1609       SmallVector<EVT, 16> VTs;
   1610       SmallVector<uint64_t, 16> Offsets;
   1611       ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0);
   1612       assert(VTs.size() == Ins.size() && "Bad value decomposition");
   1613       unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
   1614       for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
   1615         unsigned sz = VTs[i].getSizeInBits();
   1616         unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
   1617         bool needTruncate = false;
   1618         if (VTs[i].isInteger() && sz < 8) {
   1619           sz = 8;
   1620           needTruncate = true;
   1621         }
   1622 
   1623         SmallVector<EVT, 4> LoadRetVTs;
   1624         EVT TheLoadType = VTs[i];
   1625         if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
   1626           // This is for integer types only, and specifically not for
   1627           // aggregates.
   1628           LoadRetVTs.push_back(MVT::i32);
   1629           TheLoadType = MVT::i32;
   1630           needTruncate = true;
   1631         } else if (sz < 16) {
   1632           // If loading i1/i8 result, generate
   1633           //   load i8 (-> i16)
   1634           //   trunc i16 to i1/i8
   1635 
   1636           // FIXME: Do we need to set needTruncate to true here, too?  We could
   1637           // not figure out what this branch is for in D17872, so we left it
   1638           // alone.  The comment above about loading i1/i8 may be wrong, as the
   1639           // branch above seems to cover integers of size < 32.
   1640           LoadRetVTs.push_back(MVT::i16);
   1641         } else
   1642           LoadRetVTs.push_back(Ins[i].VT);
   1643         LoadRetVTs.push_back(MVT::Other);
   1644         LoadRetVTs.push_back(MVT::Glue);
   1645 
   1646         SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
   1647                                 DAG.getConstant(Offsets[i], dl, MVT::i32),
   1648                                 InFlag};
   1649         SDValue retval = DAG.getMemIntrinsicNode(
   1650             NVPTXISD::LoadParam, dl,
   1651             DAG.getVTList(LoadRetVTs), LoadRetOps,
   1652             TheLoadType, MachinePointerInfo(), AlignI);
   1653         Chain = retval.getValue(1);
   1654         InFlag = retval.getValue(2);
   1655         SDValue Ret0 = retval.getValue(0);
   1656         if (needTruncate)
   1657           Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
   1658         InVals.push_back(Ret0);
   1659       }
   1660     }
   1661   }
   1662 
   1663   Chain = DAG.getCALLSEQ_END(Chain,
   1664                              DAG.getIntPtrConstant(uniqueCallSite, dl, true),
   1665                              DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
   1666                                                    true),
   1667                              InFlag, dl);
   1668   uniqueCallSite++;
   1669 
   1670   // set isTailCall to false for now, until we figure out how to express
   1671   // tail call optimization in PTX
   1672   isTailCall = false;
   1673   return Chain;
   1674 }
   1675 
   1676 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
   1677 // (see LegalizeDAG.cpp). This is slow and uses local memory.
// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
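// For example, concatenating two v2f32 operands yields a v4f32 BUILD_VECTOR
// of the four individually extracted f32 elements.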
   1679 SDValue
   1680 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   1681   SDNode *Node = Op.getNode();
   1682   SDLoc dl(Node);
   1683   SmallVector<SDValue, 8> Ops;
   1684   unsigned NumOperands = Node->getNumOperands();
   1685   for (unsigned i = 0; i < NumOperands; ++i) {
   1686     SDValue SubOp = Node->getOperand(i);
   1687     EVT VVT = SubOp.getNode()->getValueType(0);
   1688     EVT EltVT = VVT.getVectorElementType();
   1689     unsigned NumSubElem = VVT.getVectorNumElements();
   1690     for (unsigned j = 0; j < NumSubElem; ++j) {
   1691       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
   1692                                 DAG.getIntPtrConstant(j, dl)));
   1693     }
   1694   }
   1695   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
   1696 }
   1697 
/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
   1703 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
   1704                                                   SelectionDAG &DAG) const {
   1705   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   1706   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
   1707 
   1708   EVT VT = Op.getValueType();
   1709   unsigned VTBits = VT.getSizeInBits();
   1710   SDLoc dl(Op);
   1711   SDValue ShOpLo = Op.getOperand(0);
   1712   SDValue ShOpHi = Op.getOperand(1);
   1713   SDValue ShAmt  = Op.getOperand(2);
   1714   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
   1715 
   1716   if (VTBits == 32 && STI.getSmVersion() >= 35) {
   1717 
    // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf'
    // instruction.
   1719     // {dHi, dLo} = {aHi, aLo} >> Amt
   1720     //   dHi = aHi >> Amt
   1721     //   dLo = shf.r.clamp aLo, aHi, Amt
   1722 
   1723     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
   1724     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
   1725                              ShAmt);
   1726 
   1727     SDValue Ops[2] = { Lo, Hi };
   1728     return DAG.getMergeValues(Ops, dl);
   1729   }
   1730   else {
   1731 
   1732     // {dHi, dLo} = {aHi, aLo} >> Amt
   1733     // - if (Amt>=size) then
   1734     //      dLo = aHi >> (Amt-size)
   1735     //      dHi = aHi >> Amt (this is either all 0 or all 1)
   1736     //   else
   1737     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
   1738     //      dHi = aHi >> Amt
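    //
    // Worked example (illustrative; VTBits == 32, logical shift by 40):
    //   dLo = aHi >> 8       (Amt - size == 8)
    //   dHi = aHi >> 40 == 0 (srl shifts in zeros)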
   1739 
   1740     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
   1741                                    DAG.getConstant(VTBits, dl, MVT::i32),
   1742                                    ShAmt);
   1743     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
   1744     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
   1745                                      DAG.getConstant(VTBits, dl, MVT::i32));
   1746     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
   1747     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
   1748     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
   1749 
   1750     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
   1751                                DAG.getConstant(VTBits, dl, MVT::i32),
   1752                                ISD::SETGE);
   1753     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
   1754     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
   1755 
   1756     SDValue Ops[2] = { Lo, Hi };
   1757     return DAG.getMergeValues(Ops, dl);
   1758   }
   1759 }
   1760 
/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
   1766 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
   1767                                                  SelectionDAG &DAG) const {
   1768   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   1769   assert(Op.getOpcode() == ISD::SHL_PARTS);
   1770 
   1771   EVT VT = Op.getValueType();
   1772   unsigned VTBits = VT.getSizeInBits();
   1773   SDLoc dl(Op);
   1774   SDValue ShOpLo = Op.getOperand(0);
   1775   SDValue ShOpHi = Op.getOperand(1);
   1776   SDValue ShAmt  = Op.getOperand(2);
   1777 
   1778   if (VTBits == 32 && STI.getSmVersion() >= 35) {
   1779 
    // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf'
    // instruction.
   1781     // {dHi, dLo} = {aHi, aLo} << Amt
   1782     //   dHi = shf.l.clamp aLo, aHi, Amt
   1783     //   dLo = aLo << Amt
   1784 
   1785     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
   1786                              ShAmt);
   1787     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
   1788 
   1789     SDValue Ops[2] = { Lo, Hi };
   1790     return DAG.getMergeValues(Ops, dl);
  } else {
   1793 
   1794     // {dHi, dLo} = {aHi, aLo} << Amt
   1795     // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
   1798     //   else
   1799     //      dLo = aLo << Amt
   1800     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
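    //
    // Worked example (illustrative; VTBits == 32, shift by 40):
    //   dHi = aLo << 8  (Amt - size == 8)
    //   dLo = 0         (aLo << 40 shifts everything out)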
   1801 
   1802     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
   1803                                    DAG.getConstant(VTBits, dl, MVT::i32),
   1804                                    ShAmt);
   1805     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
   1806     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
   1807                                      DAG.getConstant(VTBits, dl, MVT::i32));
   1808     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
   1809     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
   1810     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
   1811 
   1812     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
   1813                                DAG.getConstant(VTBits, dl, MVT::i32),
   1814                                ISD::SETGE);
   1815     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
   1816     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
   1817 
   1818     SDValue Ops[2] = { Lo, Hi };
   1819     return DAG.getMergeValues(Ops, dl);
   1820   }
   1821 }
   1822 
   1823 SDValue
   1824 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   1825   switch (Op.getOpcode()) {
   1826   case ISD::RETURNADDR:
   1827     return SDValue();
   1828   case ISD::FRAMEADDR:
   1829     return SDValue();
   1830   case ISD::GlobalAddress:
   1831     return LowerGlobalAddress(Op, DAG);
   1832   case ISD::INTRINSIC_W_CHAIN:
   1833     return Op;
   1834   case ISD::BUILD_VECTOR:
   1835   case ISD::EXTRACT_SUBVECTOR:
   1836     return Op;
   1837   case ISD::CONCAT_VECTORS:
   1838     return LowerCONCAT_VECTORS(Op, DAG);
   1839   case ISD::STORE:
   1840     return LowerSTORE(Op, DAG);
   1841   case ISD::LOAD:
   1842     return LowerLOAD(Op, DAG);
   1843   case ISD::SHL_PARTS:
   1844     return LowerShiftLeftParts(Op, DAG);
   1845   case ISD::SRA_PARTS:
   1846   case ISD::SRL_PARTS:
   1847     return LowerShiftRightParts(Op, DAG);
   1848   case ISD::SELECT:
   1849     return LowerSelect(Op, DAG);
   1850   default:
   1851     llvm_unreachable("Custom lowering not defined for operation");
   1852   }
   1853 }
   1854 
   1855 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
   1856   SDValue Op0 = Op->getOperand(0);
   1857   SDValue Op1 = Op->getOperand(1);
   1858   SDValue Op2 = Op->getOperand(2);
   1859   SDLoc DL(Op.getNode());
   1860 
   1861   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
   1862 
   1863   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
   1864   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
   1865   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
   1866   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
   1867 
   1868   return Trunc;
   1869 }
   1870 
   1871 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   1872   if (Op.getValueType() == MVT::i1)
   1873     return LowerLOADi1(Op, DAG);
   1874   else
   1875     return SDValue();
   1876 }
   1877 
   1878 // v = ld i1* addr
   1879 //   =>
   1880 // v1 = ld i8* addr (-> i16)
   1881 // v = trunc i16 to i1
   1882 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
   1883   SDNode *Node = Op.getNode();
   1884   LoadSDNode *LD = cast<LoadSDNode>(Node);
   1885   SDLoc dl(Node);
   1886   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
   1887   assert(Node->getValueType(0) == MVT::i1 &&
   1888          "Custom lowering for i1 load only");
   1889   SDValue newLD =
   1890       DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
   1891                   LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
   1892                   LD->isInvariant(), LD->getAlignment());
   1893   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
   1894   // The legalizer (the caller) is expecting two values from the legalized
   1895   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
   1896   // in LegalizeDAG.cpp which also uses MergeValues.
   1897   SDValue Ops[] = { result, LD->getChain() };
   1898   return DAG.getMergeValues(Ops, dl);
   1899 }
   1900 
   1901 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   1902   EVT ValVT = Op.getOperand(1).getValueType();
   1903   if (ValVT == MVT::i1)
   1904     return LowerSTOREi1(Op, DAG);
   1905   else if (ValVT.isVector())
   1906     return LowerSTOREVector(Op, DAG);
   1907   else
   1908     return SDValue();
   1909 }
   1910 
   1911 SDValue
   1912 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
   1913   SDNode *N = Op.getNode();
   1914   SDValue Val = N->getOperand(1);
   1915   SDLoc DL(N);
   1916   EVT ValVT = Val.getValueType();
   1917 
   1918   if (ValVT.isVector()) {
   1919     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
   1920     // legal.  We can (and should) split that into 2 stores of <2 x double> here
   1921     // but I'm leaving that as a TODO for now.
   1922     if (!ValVT.isSimple())
   1923       return SDValue();
   1924     switch (ValVT.getSimpleVT().SimpleTy) {
   1925     default:
   1926       return SDValue();
   1927     case MVT::v2i8:
   1928     case MVT::v2i16:
   1929     case MVT::v2i32:
   1930     case MVT::v2i64:
   1931     case MVT::v2f32:
   1932     case MVT::v2f64:
   1933     case MVT::v4i8:
   1934     case MVT::v4i16:
   1935     case MVT::v4i32:
   1936     case MVT::v4f32:
   1937       // This is a "native" vector type
   1938       break;
   1939     }
   1940 
   1941     MemSDNode *MemSD = cast<MemSDNode>(N);
   1942     const DataLayout &TD = DAG.getDataLayout();
   1943 
   1944     unsigned Align = MemSD->getAlignment();
   1945     unsigned PrefAlign =
   1946         TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
   1947     if (Align < PrefAlign) {
   1948       // This store is not sufficiently aligned, so bail out and let this vector
   1949       // store be scalarized.  Note that we may still be able to emit smaller
   1950       // vector stores.  For example, if we are storing a <4 x float> with an
   1951       // alignment of 8, this check will fail but the legalizer will try again
   1952       // with 2 x <2 x float>, which will succeed with an alignment of 8.
   1953       return SDValue();
   1954     }
   1955 
   1956     unsigned Opcode = 0;
   1957     EVT EltVT = ValVT.getVectorElementType();
   1958     unsigned NumElts = ValVT.getVectorNumElements();
   1959 
   1960     // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
   1961     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
   1962     // stored type to i16 and propagate the "real" type as the memory type.
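    // For example, a <4 x i8> store becomes a StoreV4 of four values
    // any-extended to i16, while v4i8 is kept as the memory type.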
   1963     bool NeedExt = false;
   1964     if (EltVT.getSizeInBits() < 16)
   1965       NeedExt = true;
   1966 
   1967     switch (NumElts) {
   1968     default:
   1969       return SDValue();
   1970     case 2:
   1971       Opcode = NVPTXISD::StoreV2;
   1972       break;
    case 4:
      Opcode = NVPTXISD::StoreV4;
      break;
   1977     }
   1978 
   1979     SmallVector<SDValue, 8> Ops;
   1980 
   1981     // First is the chain
   1982     Ops.push_back(N->getOperand(0));
   1983 
   1984     // Then the split values
   1985     for (unsigned i = 0; i < NumElts; ++i) {
   1986       SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
   1987                                    DAG.getIntPtrConstant(i, DL));
   1988       if (NeedExt)
   1989         ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
   1990       Ops.push_back(ExtVal);
   1991     }
   1992 
   1993     // Then any remaining arguments
   1994     Ops.append(N->op_begin() + 2, N->op_end());
   1995 
   1996     SDValue NewSt = DAG.getMemIntrinsicNode(
   1997         Opcode, DL, DAG.getVTList(MVT::Other), Ops,
   1998         MemSD->getMemoryVT(), MemSD->getMemOperand());
   1999 
   2001     return NewSt;
   2002   }
   2003 
   2004   return SDValue();
   2005 }
   2006 
   2007 // st i1 v, addr
   2008 //    =>
   2009 // v1 = zxt v to i16
   2010 // st.u8 i16, addr
   2011 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
   2012   SDNode *Node = Op.getNode();
   2013   SDLoc dl(Node);
   2014   StoreSDNode *ST = cast<StoreSDNode>(Node);
   2015   SDValue Tmp1 = ST->getChain();
   2016   SDValue Tmp2 = ST->getBasePtr();
   2017   SDValue Tmp3 = ST->getValue();
   2018   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
   2019   unsigned Alignment = ST->getAlignment();
   2020   bool isVolatile = ST->isVolatile();
   2021   bool isNonTemporal = ST->isNonTemporal();
   2022   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
   2023   SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
   2024                                      ST->getPointerInfo(), MVT::i8, isNonTemporal,
   2025                                      isVolatile, Alignment);
   2026   return Result;
   2027 }
   2028 
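// Build the external symbol "<function-name>_param_<idx>" that the parameter
// space accesses refer to, e.g. "foo_param_0" for the first parameter of a
// function foo.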
   2029 SDValue
   2030 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
   2031   std::string ParamSym;
   2032   raw_string_ostream ParamStr(ParamSym);
   2033 
   2034   ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
   2035   ParamStr.flush();
   2036 
   2037   std::string *SavedStr =
   2038     nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
   2039   return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
   2040 }
   2041 
   2042 // Check to see if the kernel argument is image*_t or sampler_t
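// e.g. an OpenCL `image2d_t` kernel argument typically arrives here as a
// pointer to the named struct type %struct._image2d_t (frontend-dependent).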
   2043 
   2044 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
   2045   static const char *const specialTypes[] = { "struct._image2d_t",
   2046                                               "struct._image3d_t",
   2047                                               "struct._sampler_t" };
   2048 
   2049   Type *Ty = arg->getType();
   2050   auto *PTy = dyn_cast<PointerType>(Ty);
   2051 
   2052   if (!PTy)
   2053     return false;
   2054 
   2055   if (!context)
   2056     return false;
   2057 
   2058   auto *STy = dyn_cast<StructType>(PTy->getElementType());
   2059   if (!STy || STy->isLiteral())
   2060     return false;
   2061 
   2062   return std::find(std::begin(specialTypes), std::end(specialTypes),
   2063                    STy->getName()) != std::end(specialTypes);
   2064 }
   2065 
   2066 SDValue NVPTXTargetLowering::LowerFormalArguments(
   2067     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   2068     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
   2069     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   2070   MachineFunction &MF = DAG.getMachineFunction();
   2071   const DataLayout &DL = DAG.getDataLayout();
   2072   auto PtrVT = getPointerTy(DAG.getDataLayout());
   2073 
   2074   const Function *F = MF.getFunction();
   2075   const AttributeSet &PAL = F->getAttributes();
   2076   const TargetLowering *TLI = STI.getTargetLowering();
   2077 
   2078   SDValue Root = DAG.getRoot();
   2079   std::vector<SDValue> OutChains;
   2080 
   2081   bool isKernel = llvm::isKernelFunction(*F);
   2082   bool isABI = (STI.getSmVersion() >= 20);
   2083   assert(isABI && "Non-ABI compilation is not supported");
   2084   if (!isABI)
   2085     return Chain;
   2086 
   2087   std::vector<Type *> argTypes;
   2088   std::vector<const Argument *> theArgs;
   2089   for (const Argument &I : F->args()) {
   2090     theArgs.push_back(&I);
   2091     argTypes.push_back(I.getType());
   2092   }
   2093   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
   2094   // Ins.size() will be larger
   2095   //   * if there is an aggregate argument with multiple fields (each field
   2096   //     showing up separately in Ins)
   2097   //   * if there is a vector argument with more than typical vector-length
   2098   //     elements (generally if more than 4) where each vector element is
   2099   //     individually present in Ins.
   2100   // So a different index should be used for indexing into Ins.
   2101   // See similar issue in LowerCall.
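  // For example, an <8 x float> argument is a single entry in argTypes but
  // contributes eight scalarized entries to Ins.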
   2102   unsigned InsIdx = 0;
   2103 
   2104   int idx = 0;
   2105   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
   2106     Type *Ty = argTypes[i];
   2107 
    // If the kernel argument is image*_t or sampler_t, convert it to
    // an i32 constant holding the parameter position. This can later be
    // matched in the AsmPrinter to output the correct mangled name.
   2111     if (isImageOrSamplerVal(
   2112             theArgs[i],
   2113             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
   2114                                      : nullptr))) {
   2115       assert(isKernel && "Only kernels can have image/sampler params");
   2116       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
   2117       continue;
   2118     }
   2119 
   2120     if (theArgs[i]->use_empty()) {
   2121       // argument is dead
   2122       if (Ty->isAggregateType()) {
   2123         SmallVector<EVT, 16> vtparts;
   2124 
   2125         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
   2126         assert(vtparts.size() > 0 && "empty aggregate type not expected");
   2127         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
   2128              ++parti) {
   2129           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
   2130           ++InsIdx;
   2131         }
   2132         if (vtparts.size() > 0)
   2133           --InsIdx;
   2134         continue;
   2135       }
   2136       if (Ty->isVectorTy()) {
   2137         EVT ObjectVT = getValueType(DL, Ty);
   2138         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
   2139         for (unsigned parti = 0; parti < NumRegs; ++parti) {
   2140           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
   2141           ++InsIdx;
   2142         }
   2143         if (NumRegs > 0)
   2144           --InsIdx;
   2145         continue;
   2146       }
   2147       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
   2148       continue;
   2149     }
   2150 
   2151     // In the following cases, assign a node order of "idx+1"
   2152     // to newly created nodes. The SDNodes for params have to
   2153     // appear in the same order as their order of appearance
   2154     // in the original function. "idx+1" holds that order.
   2155     if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
   2156       if (Ty->isAggregateType()) {
   2157         SmallVector<EVT, 16> vtparts;
   2158         SmallVector<uint64_t, 16> offsets;
   2159 
   2160         // NOTE: Here, we lose the ability to issue vector loads for vectors
   2161         // that are a part of a struct.  This should be investigated in the
   2162         // future.
   2163         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
   2164                            0);
   2165         assert(vtparts.size() > 0 && "empty aggregate type not expected");
   2166         bool aggregateIsPacked = false;
   2167         if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
   2168           aggregateIsPacked = STy->isPacked();
   2169 
   2170         SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
   2171         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
   2172              ++parti) {
   2173           EVT partVT = vtparts[parti];
   2174           Value *srcValue = Constant::getNullValue(
   2175               PointerType::get(partVT.getTypeForEVT(F->getContext()),
   2176                                llvm::ADDRESS_SPACE_PARAM));
   2177           SDValue srcAddr =
   2178               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
   2179                           DAG.getConstant(offsets[parti], dl, PtrVT));
   2180           unsigned partAlign = aggregateIsPacked
   2181                                    ? 1
   2182                                    : DL.getABITypeAlignment(
   2183                                          partVT.getTypeForEVT(F->getContext()));
   2184           SDValue p;
   2185           if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
   2186             ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
   2187                                      ISD::SEXTLOAD : ISD::ZEXTLOAD;
   2188             p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
   2189                                MachinePointerInfo(srcValue), partVT, false,
   2190                                false, false, partAlign);
   2191           } else {
   2192             p = DAG.getLoad(partVT, dl, Root, srcAddr,
   2193                             MachinePointerInfo(srcValue), false, false, false,
   2194                             partAlign);
   2195           }
   2196           if (p.getNode())
   2197             p.getNode()->setIROrder(idx + 1);
   2198           InVals.push_back(p);
   2199           ++InsIdx;
   2200         }
   2201         if (vtparts.size() > 0)
   2202           --InsIdx;
   2203         continue;
   2204       }
   2205       if (Ty->isVectorTy()) {
   2206         EVT ObjectVT = getValueType(DL, Ty);
   2207         SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
   2208         unsigned NumElts = ObjectVT.getVectorNumElements();
   2209         assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
   2210                "Vector was not scalarized");
   2211         EVT EltVT = ObjectVT.getVectorElementType();
   2212 
   2213         // V1 load
   2214         // f32 = load ...
   2215         if (NumElts == 1) {
   2216           // We only have one element, so just directly load it
   2217           Value *SrcValue = Constant::getNullValue(PointerType::get(
   2218               EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
   2219           SDValue P = DAG.getLoad(
   2220               EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
   2221               true,
   2222               DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
   2223           if (P.getNode())
   2224             P.getNode()->setIROrder(idx + 1);
   2225 
   2226           if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
   2227             P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
   2228           InVals.push_back(P);
   2229           ++InsIdx;
   2230         } else if (NumElts == 2) {
   2231           // V2 load
   2232           // f32,f32 = load ...
   2233           EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
   2234           Value *SrcValue = Constant::getNullValue(PointerType::get(
   2235               VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
   2236           SDValue P = DAG.getLoad(
   2237               VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
   2238               true,
   2239               DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
   2240           if (P.getNode())
   2241             P.getNode()->setIROrder(idx + 1);
   2242 
   2243           SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
   2244                                      DAG.getIntPtrConstant(0, dl));
   2245           SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
   2246                                      DAG.getIntPtrConstant(1, dl));
   2247 
   2248           if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
   2249             Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
   2250             Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
   2251           }
   2252 
   2253           InVals.push_back(Elt0);
   2254           InVals.push_back(Elt1);
   2255           InsIdx += 2;
   2256         } else {
   2257           // V4 loads
          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
          // the vector will be expanded to a power of 2 elements, so we know
          // we can always round up to the next multiple of 4 when creating
          // the vector loads.
   2263           // e.g.  4 elem => 1 ld.v4
   2264           //       6 elem => 2 ld.v4
   2265           //       8 elem => 2 ld.v4
   2266           //      11 elem => 3 ld.v4
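          // As a rough PTX sketch (the parameter name and register numbers
          // are illustrative, not the emitted names), a <6 x float> argument
          // would be fetched with two vector loads, the second one only
          // partially used:
          //   ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [foo_param_0];
          //   ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [foo_param_0+16];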
   2267           unsigned VecSize = 4;
   2268           if (EltVT.getSizeInBits() == 64) {
   2269             VecSize = 2;
   2270           }
   2271           EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
   2272           unsigned Ofst = 0;
   2273           for (unsigned i = 0; i < NumElts; i += VecSize) {
   2274             Value *SrcValue = Constant::getNullValue(
   2275                 PointerType::get(VecVT.getTypeForEVT(F->getContext()),
   2276                                  llvm::ADDRESS_SPACE_PARAM));
   2277             SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
   2278                                           DAG.getConstant(Ofst, dl, PtrVT));
   2279             SDValue P = DAG.getLoad(
   2280                 VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
   2281                 false, true,
   2282                 DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
   2283             if (P.getNode())
   2284               P.getNode()->setIROrder(idx + 1);
   2285 
   2286             for (unsigned j = 0; j < VecSize; ++j) {
   2287               if (i + j >= NumElts)
   2288                 break;
   2289               SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
   2290                                         DAG.getIntPtrConstant(j, dl));
   2291               if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
   2292                 Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
   2293               InVals.push_back(Elt);
   2294             }
   2295             Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
   2296           }
   2297           InsIdx += NumElts;
   2298         }
   2299 
   2300         if (NumElts > 0)
   2301           --InsIdx;
   2302         continue;
   2303       }
   2304       // A plain scalar.
   2305       EVT ObjectVT = getValueType(DL, Ty);
   2306       // In the ABI scheme, load the value from its param symbol.
   2307       SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
   2308       Value *srcValue = Constant::getNullValue(PointerType::get(
   2309           ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
   2310       SDValue p;
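              // If the declared parameter type is narrower than the type used
              // to pass it, emit a sign- or zero-extending load as requested
              // by the argument's extension flags.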
   2311       if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
   2312         ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
   2313                                        ISD::SEXTLOAD : ISD::ZEXTLOAD;
   2314         p = DAG.getExtLoad(
   2315             ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
   2316             ObjectVT, false, false, false,
   2317             DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
   2318       } else {
   2319         p = DAG.getLoad(
   2320             Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false,
   2321             false, false,
   2322             DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
   2323       }
   2324       if (p.getNode())
   2325         p.getNode()->setIROrder(idx + 1);
   2326       InVals.push_back(p);
   2327       continue;
   2328     }
   2329 
   2330     // Param has the ByVal attribute.
   2331     // Return MoveParam(param symbol).
   2332     // Ideally, the param symbol could be returned directly, but when
   2333     // the SDNode builder decides to use it in a CopyToReg(), forming
   2334     // the machine instruction fails because a TargetExternalSymbol
   2335     // (which is not lowered) is target dependent, while CopyToReg
   2336     // assumes its source has already been lowered.
   2337     EVT ObjectVT = getValueType(DL, Ty);
   2338     assert(ObjectVT == Ins[InsIdx].VT &&
   2339            "Ins type did not match function type");
   2340     SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
   2341     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
   2342     if (p.getNode())
   2343       p.getNode()->setIROrder(idx + 1);
   2344     if (isKernel)
   2345       InVals.push_back(p);
   2346     else {
   2347       SDValue p2 = DAG.getNode(
   2348           ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
   2349           DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
   2350       InVals.push_back(p2);
   2351     }
   2352   }
   2353 
   2354   // Clang will check for explicit varargs and issue an error if any are
   2355   // found. However, Clang will let code with an implicit vararg list,
   2356   // like f(), pass. See bug 617733.
   2357   // We treat this case as if the arg list is empty.
   2358   // if (F.isVarArg()) {
   2359   // assert(0 && "VarArg not supported yet!");
   2360   //}
   2361 
   2362   if (!OutChains.empty())
   2363     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
   2364 
   2365   return Chain;
   2366 }
   2367 
   2368 SDValue
   2369 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   2370                                  bool isVarArg,
   2371                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
   2372                                  const SmallVectorImpl<SDValue> &OutVals,
   2373                                  const SDLoc &dl, SelectionDAG &DAG) const {
   2374   MachineFunction &MF = DAG.getMachineFunction();
   2375   const Function *F = MF.getFunction();
   2376   Type *RetTy = F->getReturnType();
   2377   const DataLayout &TD = DAG.getDataLayout();
   2378 
   2379   bool isABI = (STI.getSmVersion() >= 20);
   2380   assert(isABI && "Non-ABI compilation is not supported");
   2381   if (!isABI)
   2382     return Chain;
   2383 
   2384   if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
   2385     // If we have a vector type, the OutVals array will be the scalarized
   2386     // components and we have to combine them into one or more vector stores.
   2387     unsigned NumElts = VTy->getNumElements();
   2388     assert(NumElts == Outs.size() && "Bad scalarization of return value");
   2389 
   2391     EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
   2392     bool NeedExtend = EltVT.getSizeInBits() < 16;
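    // Elements narrower than 16 bits are zero-extended to i16 below, since
    // PTX has no 8-bit registers.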
   2395 
   2396     // V1 store
   2397     if (NumElts == 1) {
   2398       SDValue StoreVal = OutVals[0];
   2399       // We only have one element, so just directly store it
   2400       if (NeedExtend)
   2401         StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
   2402       SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
   2403       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
   2404                                       DAG.getVTList(MVT::Other), Ops,
   2405                                       EltVT, MachinePointerInfo());
   2406 
   2407     } else if (NumElts == 2) {
   2408       // V2 store
   2409       SDValue StoreVal0 = OutVals[0];
   2410       SDValue StoreVal1 = OutVals[1];
   2411 
   2412       if (NeedExtend) {
   2413         StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
   2414         StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
   2415       }
   2416 
   2417       SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
   2418                         StoreVal1 };
   2419       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
   2420                                       DAG.getVTList(MVT::Other), Ops,
   2421                                       EltVT, MachinePointerInfo());
   2422     } else {
   2423       // V4 stores
   2424       // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
   2425       // vector will be expanded to a power of 2 elements, so we know we can
   2426       // always round up to the next multiple of 4 when creating the vector
   2427       // stores.
   2428       // e.g.  4 elem => 1 st.v4
   2429       //       6 elem => 2 st.v4
   2430       //       8 elem => 2 st.v4
   2431       //      11 elem => 3 st.v4
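      // As a rough PTX sketch (register numbers illustrative), an
      // 8 x float return value becomes two stores into the return
      // parameter space:
      //   st.param.v4.f32 [func_retval0+0],  {%f1, %f2, %f3, %f4};
      //   st.param.v4.f32 [func_retval0+16], {%f5, %f6, %f7, %f8};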
   2432 
   2433       unsigned VecSize = 4;
   2434       if (OutVals[0].getValueType().getSizeInBits() == 64)
   2435         VecSize = 2;
   2436 
   2437       unsigned Offset = 0;
   2438 
   2439       EVT VecVT =
   2440           EVT::getVectorVT(F->getContext(), EltVT, VecSize);
   2441       unsigned PerStoreOffset =
   2442           TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
   2443 
   2444       for (unsigned i = 0; i < NumElts; i += VecSize) {
   2445         // Gather the next VecSize elements, widening each as needed.
   2446         SDValue StoreVal;
   2447         SmallVector<SDValue, 8> Ops;
   2448         Ops.push_back(Chain);
   2449         Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
   2450         unsigned Opc = NVPTXISD::StoreRetvalV2;
   2451         EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
   2452 
   2453         StoreVal = OutVals[i];
   2454         if (NeedExtend)
   2455           StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2456         Ops.push_back(StoreVal);
   2457 
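        // Lanes past the end of the return vector are padded with UNDEF so
        // that every StoreRetvalV2/V4 node carries a full set of operands.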
   2458         if (i + 1 < NumElts) {
   2459           StoreVal = OutVals[i + 1];
   2460           if (NeedExtend)
   2461             StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2462         } else {
   2463           StoreVal = DAG.getUNDEF(ExtendedVT);
   2464         }
   2465         Ops.push_back(StoreVal);
   2466 
   2467         if (VecSize == 4) {
   2468           Opc = NVPTXISD::StoreRetvalV4;
   2469           if (i + 2 < NumElts) {
   2470             StoreVal = OutVals[i + 2];
   2471             if (NeedExtend)
   2472               StoreVal =
   2473                   DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2474           } else {
   2475             StoreVal = DAG.getUNDEF(ExtendedVT);
   2476           }
   2477           Ops.push_back(StoreVal);
   2478 
   2479           if (i + 3 < NumElts) {
   2480             StoreVal = OutVals[i + 3];
   2481             if (NeedExtend)
   2482               StoreVal =
   2483                   DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
   2484           } else {
   2485             StoreVal = DAG.getUNDEF(ExtendedVT);
   2486           }
   2487           Ops.push_back(StoreVal);
   2488         }
   2489 
   2491         Chain =
   2492             DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
   2493                                     EltVT, MachinePointerInfo());
   2494         Offset += PerStoreOffset;
   2495       }
   2496     }
   2497   } else {
   2498     SmallVector<EVT, 16> ValVTs;
   2499     SmallVector<uint64_t, 16> Offsets;
   2500     ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
   2501     assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
   2502 
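        // Store each component of the return value into the retval parameter
        // space at its recorded byte offset, widening narrow integers first.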
   2503     for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
   2504       SDValue theVal = OutVals[i];
   2505       EVT TheValType = theVal.getValueType();
   2506       unsigned numElems = 1;
   2507       if (TheValType.isVector())
   2508         numElems = TheValType.getVectorNumElements();
   2509       for (unsigned j = 0, je = numElems; j != je; ++j) {
   2510         SDValue TmpVal = theVal;
   2511         if (TheValType.isVector())
   2512           TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
   2513                                TheValType.getVectorElementType(), TmpVal,
   2514                                DAG.getIntPtrConstant(j, dl));
   2515         EVT TheStoreType = ValVTs[i];
   2516         if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
   2517           // The following zero-extension is for integer types only, and
   2518           // specifically not for aggregates.
   2519           TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
   2520           TheStoreType = MVT::i32;
   2521         } else if (TmpVal.getValueType().getSizeInBits() < 16)
   2523           TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
   2524 
   2525         SDValue Ops[] = {
   2526           Chain,
   2527           DAG.getConstant(Offsets[i], dl, MVT::i32),
   2528           TmpVal };
   2529         Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
   2530                                         DAG.getVTList(MVT::Other), Ops,
   2531                                         TheStoreType,
   2532                                         MachinePointerInfo());
   2533       }
   2534     }
   2535   }
   2536 
   2537   return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
   2538 }
   2539 
   2540 
   2541 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
   2542     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
   2543     SelectionDAG &DAG) const {
   2544   if (Constraint.length() > 1)
   2545     return;
   2546   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
   2548 }
   2549 
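        // Map an NVVM texture or tld4 intrinsic to the matching NVPTXISD opcode.
        // For the tex opcodes, the name encodes the geometry (1D/2D/3D/Cube,
        // optionally Array), the element type of the returned 4-element vector
        // (Float/S32/U32), the coordinate type (S32/Float), and any Level/Grad
        // sampling variant.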
   2550 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
   2551   switch (Intrinsic) {
   2552   default:
   2553     return 0;
   2554 
   2555   case Intrinsic::nvvm_tex_1d_v4f32_s32:
   2556     return NVPTXISD::Tex1DFloatS32;
   2557   case Intrinsic::nvvm_tex_1d_v4f32_f32:
   2558     return NVPTXISD::Tex1DFloatFloat;
   2559   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
   2560     return NVPTXISD::Tex1DFloatFloatLevel;
   2561   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
   2562     return NVPTXISD::Tex1DFloatFloatGrad;
   2563   case Intrinsic::nvvm_tex_1d_v4s32_s32:
   2564     return NVPTXISD::Tex1DS32S32;
   2565   case Intrinsic::nvvm_tex_1d_v4s32_f32:
   2566     return NVPTXISD::Tex1DS32Float;
   2567   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
   2568     return NVPTXISD::Tex1DS32FloatLevel;
   2569   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
   2570     return NVPTXISD::Tex1DS32FloatGrad;
   2571   case Intrinsic::nvvm_tex_1d_v4u32_s32:
   2572     return NVPTXISD::Tex1DU32S32;
   2573   case Intrinsic::nvvm_tex_1d_v4u32_f32:
   2574     return NVPTXISD::Tex1DU32Float;
   2575   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
   2576     return NVPTXISD::Tex1DU32FloatLevel;
   2577   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
   2578     return NVPTXISD::Tex1DU32FloatGrad;
   2579 
   2580   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
   2581     return NVPTXISD::Tex1DArrayFloatS32;
   2582   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
   2583     return NVPTXISD::Tex1DArrayFloatFloat;
   2584   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
   2585     return NVPTXISD::Tex1DArrayFloatFloatLevel;
   2586   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
   2587     return NVPTXISD::Tex1DArrayFloatFloatGrad;
   2588   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
   2589     return NVPTXISD::Tex1DArrayS32S32;
   2590   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
   2591     return NVPTXISD::Tex1DArrayS32Float;
   2592   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
   2593     return NVPTXISD::Tex1DArrayS32FloatLevel;
   2594   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
   2595     return NVPTXISD::Tex1DArrayS32FloatGrad;
   2596   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
   2597     return NVPTXISD::Tex1DArrayU32S32;
   2598   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
   2599     return NVPTXISD::Tex1DArrayU32Float;
   2600   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
   2601     return NVPTXISD::Tex1DArrayU32FloatLevel;
   2602   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
   2603     return NVPTXISD::Tex1DArrayU32FloatGrad;
   2604 
   2605   case Intrinsic::nvvm_tex_2d_v4f32_s32:
   2606     return NVPTXISD::Tex2DFloatS32;
   2607   case Intrinsic::nvvm_tex_2d_v4f32_f32:
   2608     return NVPTXISD::Tex2DFloatFloat;
   2609   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
   2610     return NVPTXISD::Tex2DFloatFloatLevel;
   2611   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
   2612     return NVPTXISD::Tex2DFloatFloatGrad;
   2613   case Intrinsic::nvvm_tex_2d_v4s32_s32:
   2614     return NVPTXISD::Tex2DS32S32;
   2615   case Intrinsic::nvvm_tex_2d_v4s32_f32:
   2616     return NVPTXISD::Tex2DS32Float;
   2617   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
   2618     return NVPTXISD::Tex2DS32FloatLevel;
   2619   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
   2620     return NVPTXISD::Tex2DS32FloatGrad;
   2621   case Intrinsic::nvvm_tex_2d_v4u32_s32:
   2622     return NVPTXISD::Tex2DU32S32;
   2623   case Intrinsic::nvvm_tex_2d_v4u32_f32:
   2624     return NVPTXISD::Tex2DU32Float;
   2625   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
   2626     return NVPTXISD::Tex2DU32FloatLevel;
   2627   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
   2628     return NVPTXISD::Tex2DU32FloatGrad;
   2629 
   2630   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
   2631     return NVPTXISD::Tex2DArrayFloatS32;
   2632   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
   2633     return NVPTXISD::Tex2DArrayFloatFloat;
   2634   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
   2635     return NVPTXISD::Tex2DArrayFloatFloatLevel;
   2636   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
   2637     return NVPTXISD::Tex2DArrayFloatFloatGrad;
   2638   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
   2639     return NVPTXISD::Tex2DArrayS32S32;
   2640   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
   2641     return NVPTXISD::Tex2DArrayS32Float;
   2642   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
   2643     return NVPTXISD::Tex2DArrayS32FloatLevel;
   2644   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
   2645     return NVPTXISD::Tex2DArrayS32FloatGrad;
   2646   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
   2647     return NVPTXISD::Tex2DArrayU32S32;
   2648   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
   2649     return NVPTXISD::Tex2DArrayU32Float;
   2650   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
   2651     return NVPTXISD::Tex2DArrayU32FloatLevel;
   2652   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
   2653     return NVPTXISD::Tex2DArrayU32FloatGrad;
   2654 
   2655   case Intrinsic::nvvm_tex_3d_v4f32_s32:
   2656     return NVPTXISD::Tex3DFloatS32;
   2657   case Intrinsic::nvvm_tex_3d_v4f32_f32:
   2658     return NVPTXISD::Tex3DFloatFloat;
   2659   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
   2660     return NVPTXISD::Tex3DFloatFloatLevel;
   2661   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
   2662     return NVPTXISD::Tex3DFloatFloatGrad;
   2663   case Intrinsic::nvvm_tex_3d_v4s32_s32:
   2664     return NVPTXISD::Tex3DS32S32;
   2665   case Intrinsic::nvvm_tex_3d_v4s32_f32:
   2666     return NVPTXISD::Tex3DS32Float;
   2667   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
   2668     return NVPTXISD::Tex3DS32FloatLevel;
   2669   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
   2670     return NVPTXISD::Tex3DS32FloatGrad;
   2671   case Intrinsic::nvvm_tex_3d_v4u32_s32:
   2672     return NVPTXISD::Tex3DU32S32;
   2673   case Intrinsic::nvvm_tex_3d_v4u32_f32:
   2674     return NVPTXISD::Tex3DU32Float;
   2675   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
   2676     return NVPTXISD::Tex3DU32FloatLevel;
   2677   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
   2678     return NVPTXISD::Tex3DU32FloatGrad;
   2679 
   2680   case Intrinsic::nvvm_tex_cube_v4f32_f32:
   2681     return NVPTXISD::TexCubeFloatFloat;
   2682   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
   2683     return NVPTXISD::TexCubeFloatFloatLevel;
   2684   case Intrinsic::nvvm_tex_cube_v4s32_f32:
   2685     return NVPTXISD::TexCubeS32Float;
   2686   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
   2687     return NVPTXISD::TexCubeS32FloatLevel;
   2688   case Intrinsic::nvvm_tex_cube_v4u32_f32:
   2689     return NVPTXISD::TexCubeU32Float;
   2690   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
   2691     return NVPTXISD::TexCubeU32FloatLevel;
   2692 
   2693   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
   2694     return NVPTXISD::TexCubeArrayFloatFloat;
   2695   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
   2696     return NVPTXISD::TexCubeArrayFloatFloatLevel;
   2697   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
   2698     return NVPTXISD::TexCubeArrayS32Float;
   2699   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
   2700     return NVPTXISD::TexCubeArrayS32FloatLevel;
   2701   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
   2702     return NVPTXISD::TexCubeArrayU32Float;
   2703   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
   2704     return NVPTXISD::TexCubeArrayU32FloatLevel;
   2705 
   2706   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
   2707     return NVPTXISD::Tld4R2DFloatFloat;
   2708   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
   2709     return NVPTXISD::Tld4G2DFloatFloat;
   2710   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
   2711     return NVPTXISD::Tld4B2DFloatFloat;
   2712   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
   2713     return NVPTXISD::Tld4A2DFloatFloat;
   2714   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
   2715     return NVPTXISD::Tld4R2DS64Float;
   2716   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
   2717     return NVPTXISD::Tld4G2DS64Float;
   2718   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
   2719     return NVPTXISD::Tld4B2DS64Float;
   2720   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
   2721     return NVPTXISD::Tld4A2DS64Float;
   2722   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
   2723     return NVPTXISD::Tld4R2DU64Float;
   2724   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
   2725     return NVPTXISD::Tld4G2DU64Float;
   2726   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
   2727     return NVPTXISD::Tld4B2DU64Float;
   2728   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
   2729     return NVPTXISD::Tld4A2DU64Float;
   2730 
   2731   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
   2732     return NVPTXISD::TexUnified1DFloatS32;
   2733   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
   2734     return NVPTXISD::TexUnified1DFloatFloat;
   2735   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
   2736     return NVPTXISD::TexUnified1DFloatFloatLevel;
   2737   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
   2738     return NVPTXISD::TexUnified1DFloatFloatGrad;
   2739   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
   2740     return NVPTXISD::TexUnified1DS32S32;
   2741   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
   2742     return NVPTXISD::TexUnified1DS32Float;
   2743   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
   2744     return NVPTXISD::TexUnified1DS32FloatLevel;
   2745   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
   2746     return NVPTXISD::TexUnified1DS32FloatGrad;
   2747   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
   2748     return NVPTXISD::TexUnified1DU32S32;
   2749   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
   2750     return NVPTXISD::TexUnified1DU32Float;
   2751   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
   2752     return NVPTXISD::TexUnified1DU32FloatLevel;
   2753   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
   2754     return NVPTXISD::TexUnified1DU32FloatGrad;
   2755 
   2756   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
   2757     return NVPTXISD::TexUnified1DArrayFloatS32;
   2758   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
   2759     return NVPTXISD::TexUnified1DArrayFloatFloat;
   2760   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
   2761     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
   2762   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
   2763     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
   2764   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
   2765     return NVPTXISD::TexUnified1DArrayS32S32;
   2766   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
   2767     return NVPTXISD::TexUnified1DArrayS32Float;
   2768   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
   2769     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
   2770   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
   2771     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
   2772   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
   2773     return NVPTXISD::TexUnified1DArrayU32S32;
   2774   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
   2775     return NVPTXISD::TexUnified1DArrayU32Float;
   2776   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
   2777     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
   2778   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
   2779     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
   2780 
   2781   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
   2782     return NVPTXISD::TexUnified2DFloatS32;
   2783   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
   2784     return NVPTXISD::TexUnified2DFloatFloat;
   2785   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
   2786     return NVPTXISD::TexUnified2DFloatFloatLevel;
   2787   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
   2788     return NVPTXISD::TexUnified2DFloatFloatGrad;
   2789   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
   2790     return NVPTXISD::TexUnified2DS32S32;
   2791   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
   2792     return NVPTXISD::TexUnified2DS32Float;
   2793   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
   2794     return NVPTXISD::TexUnified2DS32FloatLevel;
   2795   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
   2796     return NVPTXISD::TexUnified2DS32FloatGrad;
   2797   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
   2798     return NVPTXISD::TexUnified2DU32S32;
   2799   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
   2800     return NVPTXISD::TexUnified2DU32Float;
   2801   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
   2802     return NVPTXISD::TexUnified2DU32FloatLevel;
   2803   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
   2804     return NVPTXISD::TexUnified2DU32FloatGrad;
   2805 
   2806   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
   2807     return NVPTXISD::TexUnified2DArrayFloatS32;
   2808   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
   2809     return NVPTXISD::TexUnified2DArrayFloatFloat;
   2810   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
   2811     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
   2812   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
   2813     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
   2814   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
   2815     return NVPTXISD::TexUnified2DArrayS32S32;
   2816   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
   2817     return NVPTXISD::TexUnified2DArrayS32Float;
   2818   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
   2819     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
   2820   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
   2821     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
   2822   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
   2823     return NVPTXISD::TexUnified2DArrayU32S32;
   2824   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
   2825     return NVPTXISD::TexUnified2DArrayU32Float;
   2826   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
   2827     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
   2828   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
   2829     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
   2830 
   2831   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
   2832     return NVPTXISD::TexUnified3DFloatS32;
   2833   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
   2834     return NVPTXISD::TexUnified3DFloatFloat;
   2835   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
   2836     return NVPTXISD::TexUnified3DFloatFloatLevel;
   2837   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
   2838     return NVPTXISD::TexUnified3DFloatFloatGrad;
   2839   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
   2840     return NVPTXISD::TexUnified3DS32S32;
   2841   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
   2842     return NVPTXISD::TexUnified3DS32Float;
   2843   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
   2844     return NVPTXISD::TexUnified3DS32FloatLevel;
   2845   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
   2846     return NVPTXISD::TexUnified3DS32FloatGrad;
   2847   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
   2848     return NVPTXISD::TexUnified3DU32S32;
   2849   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
   2850     return NVPTXISD::TexUnified3DU32Float;
   2851   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
   2852     return NVPTXISD::TexUnified3DU32FloatLevel;
   2853   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
   2854     return NVPTXISD::TexUnified3DU32FloatGrad;
   2855 
   2856   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
   2857     return NVPTXISD::TexUnifiedCubeFloatFloat;
   2858   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
   2859     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
   2860   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
   2861     return NVPTXISD::TexUnifiedCubeS32Float;
   2862   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
   2863     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
   2864   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
   2865     return NVPTXISD::TexUnifiedCubeU32Float;
   2866   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
   2867     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
   2868 
   2869   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
   2870     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
   2871   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
   2872     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
   2873   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
   2874     return NVPTXISD::TexUnifiedCubeArrayS32Float;
   2875   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
   2876     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
   2877   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
   2878     return NVPTXISD::TexUnifiedCubeArrayU32Float;
   2879   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
   2880     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
   2881 
   2882   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
   2883     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
   2884   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
   2885     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
   2886   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
   2887     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
   2888   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
   2889     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
   2890   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
   2891     return NVPTXISD::Tld4UnifiedR2DS64Float;
   2892   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
   2893     return NVPTXISD::Tld4UnifiedG2DS64Float;
   2894   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
   2895     return NVPTXISD::Tld4UnifiedB2DS64Float;
   2896   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
   2897     return NVPTXISD::Tld4UnifiedA2DS64Float;
   2898   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
   2899     return NVPTXISD::Tld4UnifiedR2DU64Float;
   2900   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
   2901     return NVPTXISD::Tld4UnifiedG2DU64Float;
   2902   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
   2903     return NVPTXISD::Tld4UnifiedB2DU64Float;
   2904   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
   2905     return NVPTXISD::Tld4UnifiedA2DU64Float;
   2906   }
   2907 }
   2908 
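        // Map an NVVM surface-load intrinsic to the matching NVPTXISD opcode. The
        // Clamp/Trap/Zero suffix selects how the underlying suld instruction
        // handles out-of-range coordinates: clamp them, trap, or return zero.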
   2909 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
   2910   switch (Intrinsic) {
   2911   default:
   2912     return 0;
   2913   case Intrinsic::nvvm_suld_1d_i8_clamp:
   2914     return NVPTXISD::Suld1DI8Clamp;
   2915   case Intrinsic::nvvm_suld_1d_i16_clamp:
   2916     return NVPTXISD::Suld1DI16Clamp;
   2917   case Intrinsic::nvvm_suld_1d_i32_clamp:
   2918     return NVPTXISD::Suld1DI32Clamp;
   2919   case Intrinsic::nvvm_suld_1d_i64_clamp:
   2920     return NVPTXISD::Suld1DI64Clamp;
   2921   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
   2922     return NVPTXISD::Suld1DV2I8Clamp;
   2923   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
   2924     return NVPTXISD::Suld1DV2I16Clamp;
   2925   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
   2926     return NVPTXISD::Suld1DV2I32Clamp;
   2927   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
   2928     return NVPTXISD::Suld1DV2I64Clamp;
   2929   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
   2930     return NVPTXISD::Suld1DV4I8Clamp;
   2931   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
   2932     return NVPTXISD::Suld1DV4I16Clamp;
   2933   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
   2934     return NVPTXISD::Suld1DV4I32Clamp;
   2935   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
   2936     return NVPTXISD::Suld1DArrayI8Clamp;
   2937   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
   2938     return NVPTXISD::Suld1DArrayI16Clamp;
   2939   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
   2940     return NVPTXISD::Suld1DArrayI32Clamp;
   2941   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
   2942     return NVPTXISD::Suld1DArrayI64Clamp;
   2943   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
   2944     return NVPTXISD::Suld1DArrayV2I8Clamp;
   2945   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
   2946     return NVPTXISD::Suld1DArrayV2I16Clamp;
   2947   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
   2948     return NVPTXISD::Suld1DArrayV2I32Clamp;
   2949   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
   2950     return NVPTXISD::Suld1DArrayV2I64Clamp;
   2951   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
   2952     return NVPTXISD::Suld1DArrayV4I8Clamp;
   2953   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
   2954     return NVPTXISD::Suld1DArrayV4I16Clamp;
   2955   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
   2956     return NVPTXISD::Suld1DArrayV4I32Clamp;
   2957   case Intrinsic::nvvm_suld_2d_i8_clamp:
   2958     return NVPTXISD::Suld2DI8Clamp;
   2959   case Intrinsic::nvvm_suld_2d_i16_clamp:
   2960     return NVPTXISD::Suld2DI16Clamp;
   2961   case Intrinsic::nvvm_suld_2d_i32_clamp:
   2962     return NVPTXISD::Suld2DI32Clamp;
   2963   case Intrinsic::nvvm_suld_2d_i64_clamp:
   2964     return NVPTXISD::Suld2DI64Clamp;
   2965   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
   2966     return NVPTXISD::Suld2DV2I8Clamp;
   2967   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
   2968     return NVPTXISD::Suld2DV2I16Clamp;
   2969   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
   2970     return NVPTXISD::Suld2DV2I32Clamp;
   2971   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
   2972     return NVPTXISD::Suld2DV2I64Clamp;
   2973   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
   2974     return NVPTXISD::Suld2DV4I8Clamp;
   2975   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
   2976     return NVPTXISD::Suld2DV4I16Clamp;
   2977   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
   2978     return NVPTXISD::Suld2DV4I32Clamp;
   2979   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
   2980     return NVPTXISD::Suld2DArrayI8Clamp;
   2981   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
   2982     return NVPTXISD::Suld2DArrayI16Clamp;
   2983   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
   2984     return NVPTXISD::Suld2DArrayI32Clamp;
   2985   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
   2986     return NVPTXISD::Suld2DArrayI64Clamp;
   2987   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
   2988     return NVPTXISD::Suld2DArrayV2I8Clamp;
   2989   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
   2990     return NVPTXISD::Suld2DArrayV2I16Clamp;
   2991   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
   2992     return NVPTXISD::Suld2DArrayV2I32Clamp;
   2993   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
   2994     return NVPTXISD::Suld2DArrayV2I64Clamp;
   2995   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
   2996     return NVPTXISD::Suld2DArrayV4I8Clamp;
   2997   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
   2998     return NVPTXISD::Suld2DArrayV4I16Clamp;
   2999   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
   3000     return NVPTXISD::Suld2DArrayV4I32Clamp;
   3001   case Intrinsic::nvvm_suld_3d_i8_clamp:
   3002     return NVPTXISD::Suld3DI8Clamp;
   3003   case Intrinsic::nvvm_suld_3d_i16_clamp:
   3004     return NVPTXISD::Suld3DI16Clamp;
   3005   case Intrinsic::nvvm_suld_3d_i32_clamp:
   3006     return NVPTXISD::Suld3DI32Clamp;
   3007   case Intrinsic::nvvm_suld_3d_i64_clamp:
   3008     return NVPTXISD::Suld3DI64Clamp;
   3009   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
   3010     return NVPTXISD::Suld3DV2I8Clamp;
   3011   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
   3012     return NVPTXISD::Suld3DV2I16Clamp;
   3013   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
   3014     return NVPTXISD::Suld3DV2I32Clamp;
   3015   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
   3016     return NVPTXISD::Suld3DV2I64Clamp;
   3017   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
   3018     return NVPTXISD::Suld3DV4I8Clamp;
   3019   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
   3020     return NVPTXISD::Suld3DV4I16Clamp;
   3021   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
   3022     return NVPTXISD::Suld3DV4I32Clamp;
   3023   case Intrinsic::nvvm_suld_1d_i8_trap:
   3024     return NVPTXISD::Suld1DI8Trap;
   3025   case Intrinsic::nvvm_suld_1d_i16_trap:
   3026     return NVPTXISD::Suld1DI16Trap;
   3027   case Intrinsic::nvvm_suld_1d_i32_trap:
   3028     return NVPTXISD::Suld1DI32Trap;
   3029   case Intrinsic::nvvm_suld_1d_i64_trap:
   3030     return NVPTXISD::Suld1DI64Trap;
   3031   case Intrinsic::nvvm_suld_1d_v2i8_trap:
   3032     return NVPTXISD::Suld1DV2I8Trap;
   3033   case Intrinsic::nvvm_suld_1d_v2i16_trap:
   3034     return NVPTXISD::Suld1DV2I16Trap;
   3035   case Intrinsic::nvvm_suld_1d_v2i32_trap:
   3036     return NVPTXISD::Suld1DV2I32Trap;
   3037   case Intrinsic::nvvm_suld_1d_v2i64_trap:
   3038     return NVPTXISD::Suld1DV2I64Trap;
   3039   case Intrinsic::nvvm_suld_1d_v4i8_trap:
   3040     return NVPTXISD::Suld1DV4I8Trap;
   3041   case Intrinsic::nvvm_suld_1d_v4i16_trap:
   3042     return NVPTXISD::Suld1DV4I16Trap;
   3043   case Intrinsic::nvvm_suld_1d_v4i32_trap:
   3044     return NVPTXISD::Suld1DV4I32Trap;
   3045   case Intrinsic::nvvm_suld_1d_array_i8_trap:
   3046     return NVPTXISD::Suld1DArrayI8Trap;
   3047   case Intrinsic::nvvm_suld_1d_array_i16_trap:
   3048     return NVPTXISD::Suld1DArrayI16Trap;
   3049   case Intrinsic::nvvm_suld_1d_array_i32_trap:
   3050     return NVPTXISD::Suld1DArrayI32Trap;
   3051   case Intrinsic::nvvm_suld_1d_array_i64_trap:
   3052     return NVPTXISD::Suld1DArrayI64Trap;
   3053   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
   3054     return NVPTXISD::Suld1DArrayV2I8Trap;
   3055   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
   3056     return NVPTXISD::Suld1DArrayV2I16Trap;
   3057   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
   3058     return NVPTXISD::Suld1DArrayV2I32Trap;
   3059   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
   3060     return NVPTXISD::Suld1DArrayV2I64Trap;
   3061   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
   3062     return NVPTXISD::Suld1DArrayV4I8Trap;
   3063   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
   3064     return NVPTXISD::Suld1DArrayV4I16Trap;
   3065   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
   3066     return NVPTXISD::Suld1DArrayV4I32Trap;
   3067   case Intrinsic::nvvm_suld_2d_i8_trap:
   3068     return NVPTXISD::Suld2DI8Trap;
   3069   case Intrinsic::nvvm_suld_2d_i16_trap:
   3070     return NVPTXISD::Suld2DI16Trap;
   3071   case Intrinsic::nvvm_suld_2d_i32_trap:
   3072     return NVPTXISD::Suld2DI32Trap;
   3073   case Intrinsic::nvvm_suld_2d_i64_trap:
   3074     return NVPTXISD::Suld2DI64Trap;
   3075   case Intrinsic::nvvm_suld_2d_v2i8_trap:
   3076     return NVPTXISD::Suld2DV2I8Trap;
   3077   case Intrinsic::nvvm_suld_2d_v2i16_trap:
   3078     return NVPTXISD::Suld2DV2I16Trap;
   3079   case Intrinsic::nvvm_suld_2d_v2i32_trap:
   3080     return NVPTXISD::Suld2DV2I32Trap;
   3081   case Intrinsic::nvvm_suld_2d_v2i64_trap:
   3082     return NVPTXISD::Suld2DV2I64Trap;
   3083   case Intrinsic::nvvm_suld_2d_v4i8_trap:
   3084     return NVPTXISD::Suld2DV4I8Trap;
   3085   case Intrinsic::nvvm_suld_2d_v4i16_trap:
   3086     return NVPTXISD::Suld2DV4I16Trap;
   3087   case Intrinsic::nvvm_suld_2d_v4i32_trap:
   3088     return NVPTXISD::Suld2DV4I32Trap;
   3089   case Intrinsic::nvvm_suld_2d_array_i8_trap:
   3090     return NVPTXISD::Suld2DArrayI8Trap;
   3091   case Intrinsic::nvvm_suld_2d_array_i16_trap:
   3092     return NVPTXISD::Suld2DArrayI16Trap;
   3093   case Intrinsic::nvvm_suld_2d_array_i32_trap:
   3094     return NVPTXISD::Suld2DArrayI32Trap;
   3095   case Intrinsic::nvvm_suld_2d_array_i64_trap:
   3096     return NVPTXISD::Suld2DArrayI64Trap;
   3097   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
   3098     return NVPTXISD::Suld2DArrayV2I8Trap;
   3099   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
   3100     return NVPTXISD::Suld2DArrayV2I16Trap;
   3101   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
   3102     return NVPTXISD::Suld2DArrayV2I32Trap;
   3103   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
   3104     return NVPTXISD::Suld2DArrayV2I64Trap;
   3105   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
   3106     return NVPTXISD::Suld2DArrayV4I8Trap;
   3107   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
   3108     return NVPTXISD::Suld2DArrayV4I16Trap;
   3109   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
   3110     return NVPTXISD::Suld2DArrayV4I32Trap;
   3111   case Intrinsic::nvvm_suld_3d_i8_trap:
   3112     return NVPTXISD::Suld3DI8Trap;
   3113   case Intrinsic::nvvm_suld_3d_i16_trap:
   3114     return NVPTXISD::Suld3DI16Trap;
   3115   case Intrinsic::nvvm_suld_3d_i32_trap:
   3116     return NVPTXISD::Suld3DI32Trap;
   3117   case Intrinsic::nvvm_suld_3d_i64_trap:
   3118     return NVPTXISD::Suld3DI64Trap;
   3119   case Intrinsic::nvvm_suld_3d_v2i8_trap:
   3120     return NVPTXISD::Suld3DV2I8Trap;
   3121   case Intrinsic::nvvm_suld_3d_v2i16_trap:
   3122     return NVPTXISD::Suld3DV2I16Trap;
   3123   case Intrinsic::nvvm_suld_3d_v2i32_trap:
   3124     return NVPTXISD::Suld3DV2I32Trap;
   3125   case Intrinsic::nvvm_suld_3d_v2i64_trap:
   3126     return NVPTXISD::Suld3DV2I64Trap;
   3127   case Intrinsic::nvvm_suld_3d_v4i8_trap:
   3128     return NVPTXISD::Suld3DV4I8Trap;
   3129   case Intrinsic::nvvm_suld_3d_v4i16_trap:
   3130     return NVPTXISD::Suld3DV4I16Trap;
   3131   case Intrinsic::nvvm_suld_3d_v4i32_trap:
   3132     return NVPTXISD::Suld3DV4I32Trap;
   3133   case Intrinsic::nvvm_suld_1d_i8_zero:
   3134     return NVPTXISD::Suld1DI8Zero;
   3135   case Intrinsic::nvvm_suld_1d_i16_zero:
   3136     return NVPTXISD::Suld1DI16Zero;
   3137   case Intrinsic::nvvm_suld_1d_i32_zero:
   3138     return NVPTXISD::Suld1DI32Zero;
   3139   case Intrinsic::nvvm_suld_1d_i64_zero:
   3140     return NVPTXISD::Suld1DI64Zero;
   3141   case Intrinsic::nvvm_suld_1d_v2i8_zero:
   3142     return NVPTXISD::Suld1DV2I8Zero;
   3143   case Intrinsic::nvvm_suld_1d_v2i16_zero:
   3144     return NVPTXISD::Suld1DV2I16Zero;
   3145   case Intrinsic::nvvm_suld_1d_v2i32_zero:
   3146     return NVPTXISD::Suld1DV2I32Zero;
   3147   case Intrinsic::nvvm_suld_1d_v2i64_zero:
   3148     return NVPTXISD::Suld1DV2I64Zero;
   3149   case Intrinsic::nvvm_suld_1d_v4i8_zero:
   3150     return NVPTXISD::Suld1DV4I8Zero;
   3151   case Intrinsic::nvvm_suld_1d_v4i16_zero:
   3152     return NVPTXISD::Suld1DV4I16Zero;
   3153   case Intrinsic::nvvm_suld_1d_v4i32_zero:
   3154     return NVPTXISD::Suld1DV4I32Zero;
   3155   case Intrinsic::nvvm_suld_1d_array_i8_zero:
   3156     return NVPTXISD::Suld1DArrayI8Zero;
   3157   case Intrinsic::nvvm_suld_1d_array_i16_zero:
   3158     return NVPTXISD::Suld1DArrayI16Zero;
   3159   case Intrinsic::nvvm_suld_1d_array_i32_zero:
   3160     return NVPTXISD::Suld1DArrayI32Zero;
   3161   case Intrinsic::nvvm_suld_1d_array_i64_zero:
   3162     return NVPTXISD::Suld1DArrayI64Zero;
   3163   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
   3164     return NVPTXISD::Suld1DArrayV2I8Zero;
   3165   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
   3166     return NVPTXISD::Suld1DArrayV2I16Zero;
   3167   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
   3168     return NVPTXISD::Suld1DArrayV2I32Zero;
   3169   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
   3170     return NVPTXISD::Suld1DArrayV2I64Zero;
   3171   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
   3172     return NVPTXISD::Suld1DArrayV4I8Zero;
   3173   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
   3174     return NVPTXISD::Suld1DArrayV4I16Zero;
   3175   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
   3176     return NVPTXISD::Suld1DArrayV4I32Zero;
   3177   case Intrinsic::nvvm_suld_2d_i8_zero:
   3178     return NVPTXISD::Suld2DI8Zero;
   3179   case Intrinsic::nvvm_suld_2d_i16_zero:
   3180     return NVPTXISD::Suld2DI16Zero;
   3181   case Intrinsic::nvvm_suld_2d_i32_zero:
   3182     return NVPTXISD::Suld2DI32Zero;
   3183   case Intrinsic::nvvm_suld_2d_i64_zero:
   3184     return NVPTXISD::Suld2DI64Zero;
   3185   case Intrinsic::nvvm_suld_2d_v2i8_zero:
   3186     return NVPTXISD::Suld2DV2I8Zero;
   3187   case Intrinsic::nvvm_suld_2d_v2i16_zero:
   3188     return NVPTXISD::Suld2DV2I16Zero;
   3189   case Intrinsic::nvvm_suld_2d_v2i32_zero:
   3190     return NVPTXISD::Suld2DV2I32Zero;
   3191   case Intrinsic::nvvm_suld_2d_v2i64_zero:
   3192     return NVPTXISD::Suld2DV2I64Zero;
   3193   case Intrinsic::nvvm_suld_2d_v4i8_zero:
   3194     return NVPTXISD::Suld2DV4I8Zero;
   3195   case Intrinsic::nvvm_suld_2d_v4i16_zero:
   3196     return NVPTXISD::Suld2DV4I16Zero;
   3197   case Intrinsic::nvvm_suld_2d_v4i32_zero:
   3198     return NVPTXISD::Suld2DV4I32Zero;
   3199   case Intrinsic::nvvm_suld_2d_array_i8_zero:
   3200     return NVPTXISD::Suld2DArrayI8Zero;
   3201   case Intrinsic::nvvm_suld_2d_array_i16_zero:
   3202     return NVPTXISD::Suld2DArrayI16Zero;
   3203   case Intrinsic::nvvm_suld_2d_array_i32_zero:
   3204     return NVPTXISD::Suld2DArrayI32Zero;
   3205   case Intrinsic::nvvm_suld_2d_array_i64_zero:
   3206     return NVPTXISD::Suld2DArrayI64Zero;
   3207   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
   3208     return NVPTXISD::Suld2DArrayV2I8Zero;
   3209   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
   3210     return NVPTXISD::Suld2DArrayV2I16Zero;
   3211   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
   3212     return NVPTXISD::Suld2DArrayV2I32Zero;
   3213   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
   3214     return NVPTXISD::Suld2DArrayV2I64Zero;
   3215   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
   3216     return NVPTXISD::Suld2DArrayV4I8Zero;
   3217   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
   3218     return NVPTXISD::Suld2DArrayV4I16Zero;
   3219   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
   3220     return NVPTXISD::Suld2DArrayV4I32Zero;
   3221   case Intrinsic::nvvm_suld_3d_i8_zero:
   3222     return NVPTXISD::Suld3DI8Zero;
   3223   case Intrinsic::nvvm_suld_3d_i16_zero:
   3224     return NVPTXISD::Suld3DI16Zero;
   3225   case Intrinsic::nvvm_suld_3d_i32_zero:
   3226     return NVPTXISD::Suld3DI32Zero;
   3227   case Intrinsic::nvvm_suld_3d_i64_zero:
   3228     return NVPTXISD::Suld3DI64Zero;
   3229   case Intrinsic::nvvm_suld_3d_v2i8_zero:
   3230     return NVPTXISD::Suld3DV2I8Zero;
   3231   case Intrinsic::nvvm_suld_3d_v2i16_zero:
   3232     return NVPTXISD::Suld3DV2I16Zero;
   3233   case Intrinsic::nvvm_suld_3d_v2i32_zero:
   3234     return NVPTXISD::Suld3DV2I32Zero;
   3235   case Intrinsic::nvvm_suld_3d_v2i64_zero:
   3236     return NVPTXISD::Suld3DV2I64Zero;
   3237   case Intrinsic::nvvm_suld_3d_v4i8_zero:
   3238     return NVPTXISD::Suld3DV4I8Zero;
   3239   case Intrinsic::nvvm_suld_3d_v4i16_zero:
   3240     return NVPTXISD::Suld3DV4I16Zero;
   3241   case Intrinsic::nvvm_suld_3d_v4i32_zero:
   3242     return NVPTXISD::Suld3DV4I32Zero;
   3243   }
   3244 }
   3245 
   3246 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
   3247 // TgtMemIntrinsic because we need information that is only available in
   3248 // the "Value" type of the destination pointer: in particular, its
   3249 // address space.
   3251 bool NVPTXTargetLowering::getTgtMemIntrinsic(
   3252     IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
   3253   switch (Intrinsic) {
   3254   default:
   3255     return false;
   3256 
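          // Atomic read-modify-write operations both read and write memory, so
          // they are marked accordingly.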
   3257   case Intrinsic::nvvm_atomic_load_add_f32:
   3258     Info.opc = ISD::INTRINSIC_W_CHAIN;
   3259     Info.memVT = MVT::f32;
   3260     Info.ptrVal = I.getArgOperand(0);
   3261     Info.offset = 0;
   3262     Info.vol = 0;
   3263     Info.readMem = true;
   3264     Info.writeMem = true;
   3265     Info.align = 0;
   3266     return true;
   3267 
   3268   case Intrinsic::nvvm_atomic_load_inc_32:
   3269   case Intrinsic::nvvm_atomic_load_dec_32:
   3270     Info.opc = ISD::INTRINSIC_W_CHAIN;
   3271     Info.memVT = MVT::i32;
   3272     Info.ptrVal = I.getArgOperand(0);
   3273     Info.offset = 0;
   3274     Info.vol = 0;
   3275     Info.readMem = true;
   3276     Info.writeMem = true;
   3277     Info.align = 0;
   3278     return true;
   3279 
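          // ldu/ldg are read-only loads from global memory (ldg goes through the
          // read-only data cache); the required alignment is carried as the
          // intrinsic's second argument.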
   3280   case Intrinsic::nvvm_ldu_global_i:
   3281   case Intrinsic::nvvm_ldu_global_f:
   3282   case Intrinsic::nvvm_ldu_global_p: {
   3283     auto &DL = I.getModule()->getDataLayout();
   3284     Info.opc = ISD::INTRINSIC_W_CHAIN;
   3285     if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
   3286       Info.memVT = getPointerTy(DL);
   3287     else
   3288       Info.memVT = getValueType(DL, I.getType());
   3291     Info.ptrVal = I.getArgOperand(0);
   3292     Info.offset = 0;
   3293     Info.vol = 0;
   3294     Info.readMem = true;
   3295     Info.writeMem = false;
   3296     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
   3297 
   3298     return true;
   3299   }
   3300   case Intrinsic::nvvm_ldg_global_i:
   3301   case Intrinsic::nvvm_ldg_global_f:
   3302   case Intrinsic::nvvm_ldg_global_p: {
   3303     auto &DL = I.getModule()->getDataLayout();
   3304 
   3305     Info.opc = ISD::INTRINSIC_W_CHAIN;
   3306     if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
   3307       Info.memVT = getPointerTy(DL);
   3308     else
   3309       Info.memVT = getValueType(DL, I.getType());
   3312     Info.ptrVal = I.getArgOperand(0);
   3313     Info.offset = 0;
   3314     Info.vol = 0;
   3315     Info.readMem = true;
   3316     Info.writeMem = false;
   3317     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
   3318 
   3319     return true;
   3320   }
   3321 
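          // The v4f32 texture and tld4 intrinsics below all read a 16-byte
          // aligned vector through the texture unit and never write memory.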
   3322   case Intrinsic::nvvm_tex_1d_v4f32_s32:
   3323   case Intrinsic::nvvm_tex_1d_v4f32_f32:
   3324   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
   3325   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
   3326   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
   3327   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
   3328   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
   3329   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
   3330   case Intrinsic::nvvm_tex_2d_v4f32_s32:
   3331   case Intrinsic::nvvm_tex_2d_v4f32_f32:
   3332   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
   3333   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
   3334   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
   3335   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
   3336   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
   3337   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
   3338   case Intrinsic::nvvm_tex_3d_v4f32_s32:
   3339   case Intrinsic::nvvm_tex_3d_v4f32_f32:
   3340   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
   3341   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
   3342   case Intrinsic::nvvm_tex_cube_v4f32_f32:
   3343   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
   3344   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
   3345   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
   3346   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
   3347   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
   3348   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
   3349   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
   3350   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
   3351   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
   3352   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
   3353   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
   3354   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
   3355   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
   3356   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
   3357   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
   3358   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
   3359   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
   3360   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
   3361   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
   3362   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
   3363   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
   3364   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
   3365   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
   3366   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
   3367   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
   3368   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
   3369   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
   3370   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
   3371   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
   3372   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
   3373   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
   3374   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
   3375   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
   3376   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
   3377   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
   3378     Info.opc = getOpcForTextureInstr(Intrinsic);
   3379     Info.memVT = MVT::v4f32;
   3380     Info.ptrVal = nullptr;
   3381     Info.offset = 0;
   3382     Info.vol = 0;
   3383     Info.readMem = true;
   3384     Info.writeMem = false;
   3385     Info.align = 16;
   3386     return true;
   3387   }
   3388   case Intrinsic::nvvm_tex_1d_v4s32_s32:
   3389   case Intrinsic::nvvm_tex_1d_v4s32_f32:
   3390   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
   3391   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
   3392   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
   3393   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
   3394   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
   3395   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
   3396   case Intrinsic::nvvm_tex_2d_v4s32_s32:
   3397   case Intrinsic::nvvm_tex_2d_v4s32_f32:
   3398   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
   3399   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
   3400   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
   3401   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
   3402   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
   3403   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
   3404   case Intrinsic::nvvm_tex_3d_v4s32_s32:
   3405   case Intrinsic::nvvm_tex_3d_v4s32_f32:
   3406   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
   3407   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
   3408   case Intrinsic::nvvm_tex_cube_v4s32_f32:
   3409   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
   3410   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
   3411   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
   3412   case Intrinsic::nvvm_tex_cube_v4u32_f32:
   3413   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
   3414   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
   3415   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
   3416   case Intrinsic::nvvm_tex_1d_v4u32_s32:
   3417   case Intrinsic::nvvm_tex_1d_v4u32_f32:
   3418   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
   3419   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
   3420   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
   3421   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
   3422   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
   3423   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
   3424   case Intrinsic::nvvm_tex_2d_v4u32_s32:
   3425   case Intrinsic::nvvm_tex_2d_v4u32_f32:
   3426   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
   3427   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
   3428   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
   3429   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
   3430   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
   3431   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
   3432   case Intrinsic::nvvm_tex_3d_v4u32_s32:
   3433   case Intrinsic::nvvm_tex_3d_v4u32_f32:
   3434   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
   3435   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
   3436   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
   3437   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
   3438   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
   3439   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
   3440   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
   3441   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
   3442   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
   3443   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
   3444   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
   3445   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
   3446   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
   3447   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
   3448   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
   3449   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
   3450   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
   3451   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
   3452   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
   3453   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
   3454   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
   3455   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
   3456   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
   3457   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
   3458   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
   3459   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
   3460   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
   3461   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
   3462   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
   3463   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
   3464   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
   3465   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
   3466   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
   3467   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
   3468   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
   3469   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
   3470   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
   3471   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
   3472   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
   3473   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
   3474   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
   3475   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
   3476   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
   3477   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
   3478   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
   3479   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
   3480   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
   3481   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
   3482   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
   3483   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
   3484   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
   3485   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
   3486   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
   3487   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
   3488   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
   3489   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
   3490   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
   3491   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
   3492   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
   3493   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
   3494   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
   3495   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
   3496   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
   3497   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
   3498   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
   3499   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
   3500     Info.opc = getOpcForTextureInstr(Intrinsic);
   3501     Info.memVT = MVT::v4i32;
   3502     Info.ptrVal = nullptr;
   3503     Info.offset = 0;
   3504     Info.vol = false;
   3505     Info.readMem = true;
   3506     Info.writeMem = false;
   3507     Info.align = 16;
   3508     return true;
   3509   }
   3510   case Intrinsic::nvvm_suld_1d_i8_clamp:
   3511   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
   3512   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
   3513   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
   3514   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
   3515   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
   3516   case Intrinsic::nvvm_suld_2d_i8_clamp:
   3517   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
   3518   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
   3519   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
   3520   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
   3521   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
   3522   case Intrinsic::nvvm_suld_3d_i8_clamp:
   3523   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
   3524   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
   3525   case Intrinsic::nvvm_suld_1d_i8_trap:
   3526   case Intrinsic::nvvm_suld_1d_v2i8_trap:
   3527   case Intrinsic::nvvm_suld_1d_v4i8_trap:
   3528   case Intrinsic::nvvm_suld_1d_array_i8_trap:
   3529   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
   3530   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
   3531   case Intrinsic::nvvm_suld_2d_i8_trap:
   3532   case Intrinsic::nvvm_suld_2d_v2i8_trap:
   3533   case Intrinsic::nvvm_suld_2d_v4i8_trap:
   3534   case Intrinsic::nvvm_suld_2d_array_i8_trap:
   3535   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
   3536   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
   3537   case Intrinsic::nvvm_suld_3d_i8_trap:
   3538   case Intrinsic::nvvm_suld_3d_v2i8_trap:
   3539   case Intrinsic::nvvm_suld_3d_v4i8_trap:
   3540   case Intrinsic::nvvm_suld_1d_i8_zero:
   3541   case Intrinsic::nvvm_suld_1d_v2i8_zero:
   3542   case Intrinsic::nvvm_suld_1d_v4i8_zero:
   3543   case Intrinsic::nvvm_suld_1d_array_i8_zero:
   3544   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
   3545   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
   3546   case Intrinsic::nvvm_suld_2d_i8_zero:
   3547   case Intrinsic::nvvm_suld_2d_v2i8_zero:
   3548   case Intrinsic::nvvm_suld_2d_v4i8_zero:
   3549   case Intrinsic::nvvm_suld_2d_array_i8_zero:
   3550   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
   3551   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
   3552   case Intrinsic::nvvm_suld_3d_i8_zero:
   3553   case Intrinsic::nvvm_suld_3d_v2i8_zero:
   3554   case Intrinsic::nvvm_suld_3d_v4i8_zero: {
   3555     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3556     Info.memVT = MVT::i8;
   3557     Info.ptrVal = nullptr;
   3558     Info.offset = 0;
   3559     Info.vol = false;
   3560     Info.readMem = true;
   3561     Info.writeMem = false;
   3562     Info.align = 16;
   3563     return true;
   3564   }
   3565   case Intrinsic::nvvm_suld_1d_i16_clamp:
   3566   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
   3567   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
   3568   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
   3569   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
   3570   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
   3571   case Intrinsic::nvvm_suld_2d_i16_clamp:
   3572   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
   3573   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
   3574   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
   3575   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
   3576   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
   3577   case Intrinsic::nvvm_suld_3d_i16_clamp:
   3578   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
   3579   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
   3580   case Intrinsic::nvvm_suld_1d_i16_trap:
   3581   case Intrinsic::nvvm_suld_1d_v2i16_trap:
   3582   case Intrinsic::nvvm_suld_1d_v4i16_trap:
   3583   case Intrinsic::nvvm_suld_1d_array_i16_trap:
   3584   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
   3585   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
   3586   case Intrinsic::nvvm_suld_2d_i16_trap:
   3587   case Intrinsic::nvvm_suld_2d_v2i16_trap:
   3588   case Intrinsic::nvvm_suld_2d_v4i16_trap:
   3589   case Intrinsic::nvvm_suld_2d_array_i16_trap:
   3590   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
   3591   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
   3592   case Intrinsic::nvvm_suld_3d_i16_trap:
   3593   case Intrinsic::nvvm_suld_3d_v2i16_trap:
   3594   case Intrinsic::nvvm_suld_3d_v4i16_trap:
   3595   case Intrinsic::nvvm_suld_1d_i16_zero:
   3596   case Intrinsic::nvvm_suld_1d_v2i16_zero:
   3597   case Intrinsic::nvvm_suld_1d_v4i16_zero:
   3598   case Intrinsic::nvvm_suld_1d_array_i16_zero:
   3599   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
   3600   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
   3601   case Intrinsic::nvvm_suld_2d_i16_zero:
   3602   case Intrinsic::nvvm_suld_2d_v2i16_zero:
   3603   case Intrinsic::nvvm_suld_2d_v4i16_zero:
   3604   case Intrinsic::nvvm_suld_2d_array_i16_zero:
   3605   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
   3606   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
   3607   case Intrinsic::nvvm_suld_3d_i16_zero:
   3608   case Intrinsic::nvvm_suld_3d_v2i16_zero:
   3609   case Intrinsic::nvvm_suld_3d_v4i16_zero: {
   3610     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3611     Info.memVT = MVT::i16;
   3612     Info.ptrVal = nullptr;
   3613     Info.offset = 0;
   3614     Info.vol = false;
   3615     Info.readMem = true;
   3616     Info.writeMem = false;
   3617     Info.align = 16;
   3618     return true;
   3619   }
   3620   case Intrinsic::nvvm_suld_1d_i32_clamp:
   3621   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
   3622   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
   3623   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
   3624   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
   3625   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
   3626   case Intrinsic::nvvm_suld_2d_i32_clamp:
   3627   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
   3628   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
   3629   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
   3630   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
   3631   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
   3632   case Intrinsic::nvvm_suld_3d_i32_clamp:
   3633   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
   3634   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
   3635   case Intrinsic::nvvm_suld_1d_i32_trap:
   3636   case Intrinsic::nvvm_suld_1d_v2i32_trap:
   3637   case Intrinsic::nvvm_suld_1d_v4i32_trap:
   3638   case Intrinsic::nvvm_suld_1d_array_i32_trap:
   3639   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
   3640   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
   3641   case Intrinsic::nvvm_suld_2d_i32_trap:
   3642   case Intrinsic::nvvm_suld_2d_v2i32_trap:
   3643   case Intrinsic::nvvm_suld_2d_v4i32_trap:
   3644   case Intrinsic::nvvm_suld_2d_array_i32_trap:
   3645   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
   3646   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
   3647   case Intrinsic::nvvm_suld_3d_i32_trap:
   3648   case Intrinsic::nvvm_suld_3d_v2i32_trap:
   3649   case Intrinsic::nvvm_suld_3d_v4i32_trap:
   3650   case Intrinsic::nvvm_suld_1d_i32_zero:
   3651   case Intrinsic::nvvm_suld_1d_v2i32_zero:
   3652   case Intrinsic::nvvm_suld_1d_v4i32_zero:
   3653   case Intrinsic::nvvm_suld_1d_array_i32_zero:
   3654   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
   3655   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
   3656   case Intrinsic::nvvm_suld_2d_i32_zero:
   3657   case Intrinsic::nvvm_suld_2d_v2i32_zero:
   3658   case Intrinsic::nvvm_suld_2d_v4i32_zero:
   3659   case Intrinsic::nvvm_suld_2d_array_i32_zero:
   3660   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
   3661   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
   3662   case Intrinsic::nvvm_suld_3d_i32_zero:
   3663   case Intrinsic::nvvm_suld_3d_v2i32_zero:
   3664   case Intrinsic::nvvm_suld_3d_v4i32_zero: {
   3665     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3666     Info.memVT = MVT::i32;
   3667     Info.ptrVal = nullptr;
   3668     Info.offset = 0;
   3669     Info.vol = false;
   3670     Info.readMem = true;
   3671     Info.writeMem = false;
   3672     Info.align = 16;
   3673     return true;
   3674   }
   3675   case Intrinsic::nvvm_suld_1d_i64_clamp:
   3676   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
   3677   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
   3678   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
   3679   case Intrinsic::nvvm_suld_2d_i64_clamp:
   3680   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
   3681   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
   3682   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
   3683   case Intrinsic::nvvm_suld_3d_i64_clamp:
   3684   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
   3685   case Intrinsic::nvvm_suld_1d_i64_trap:
   3686   case Intrinsic::nvvm_suld_1d_v2i64_trap:
   3687   case Intrinsic::nvvm_suld_1d_array_i64_trap:
   3688   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
   3689   case Intrinsic::nvvm_suld_2d_i64_trap:
   3690   case Intrinsic::nvvm_suld_2d_v2i64_trap:
   3691   case Intrinsic::nvvm_suld_2d_array_i64_trap:
   3692   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
   3693   case Intrinsic::nvvm_suld_3d_i64_trap:
   3694   case Intrinsic::nvvm_suld_3d_v2i64_trap:
   3695   case Intrinsic::nvvm_suld_1d_i64_zero:
   3696   case Intrinsic::nvvm_suld_1d_v2i64_zero:
   3697   case Intrinsic::nvvm_suld_1d_array_i64_zero:
   3698   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
   3699   case Intrinsic::nvvm_suld_2d_i64_zero:
   3700   case Intrinsic::nvvm_suld_2d_v2i64_zero:
   3701   case Intrinsic::nvvm_suld_2d_array_i64_zero:
   3702   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
   3703   case Intrinsic::nvvm_suld_3d_i64_zero:
   3704   case Intrinsic::nvvm_suld_3d_v2i64_zero: {
   3705     Info.opc = getOpcForSurfaceInstr(Intrinsic);
   3706     Info.memVT = MVT::i64;
   3707     Info.ptrVal = nullptr;
   3708     Info.offset = 0;
   3709     Info.vol = false;
   3710     Info.readMem = true;
   3711     Info.writeMem = false;
   3712     Info.align = 16;
   3713     return true;
   3714   }
   3715   }
   3716   return false;
   3717 }
   3718 
   3719 /// isLegalAddressingMode - Return true if the addressing mode represented
   3720 /// by AM is legal for this target, for a load/store of the specified type.
   3721 /// Used to guide target-specific optimizations, such as loop strength
   3722 /// reduction (LoopStrengthReduce.cpp) and address-mode optimization
   3723 /// (CodeGenPrepare.cpp).
   3724 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
   3725                                                 const AddrMode &AM, Type *Ty,
   3726                                                 unsigned AS) const {
   3727 
   3728   // AddrMode - This represents an addressing mode of:
   3729   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
   3730   //
   3731   // The legal address modes are
   3732   // - [avar]
   3733   // - [areg]
   3734   // - [areg+immoff]
   3735   // - [immAddr]
   3736 
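          // Illustrative PTX for each legal mode (assumed typical forms; this
          // function only validates the mode, it does not emit code):
          //   ld.global.u32 %r0, [gvar];      // [avar]
          //   ld.global.u32 %r1, [%rd2];      // [areg]
          //   ld.global.u32 %r2, [%rd2+8];    // [areg+immoff]
          //   ld.global.u32 %r3, [0x1000];    // [immAddr]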
   3737   if (AM.BaseGV) {
   3738     return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
   3739   }
   3740 
   3741   switch (AM.Scale) {
   3742   case 0: // "r", "r+i" or "i" is allowed
   3743     break;
   3744   case 1:
   3745     if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
   3746       return false;
   3747     // Otherwise we have r+i.
   3748     break;
   3749   default:
   3750     // No scale > 1 is allowed
   3751     return false;
   3752   }
   3753   return true;
   3754 }
   3755 
   3756 //===----------------------------------------------------------------------===//
   3757 //                         NVPTX Inline Assembly Support
   3758 //===----------------------------------------------------------------------===//
   3759 
   3760 /// getConstraintType - Given a constraint letter, return the type of
   3761 /// constraint it is for this target.
   3762 NVPTXTargetLowering::ConstraintType
   3763 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
   3764   if (Constraint.size() == 1) {
   3765     switch (Constraint[0]) {
   3766     default:
   3767       break;
   3768     case 'b':
   3769     case 'r':
   3770     case 'h':
   3771     case 'c':
   3772     case 'l':
   3773     case 'f':
   3774     case 'd':
   3775     case '0':
   3776     case 'N':
   3777       return C_RegisterClass;
   3778     }
   3779   }
   3780   return TargetLowering::getConstraintType(Constraint);
   3781 }
   3782 
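        // Map single-letter constraints onto NVPTX register classes. For
        // example (an illustrative CUDA inline-asm use, not from this file):
        //   asm("add.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
        // requests 32-bit integer registers via the 'r' constraint.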
   3783 std::pair<unsigned, const TargetRegisterClass *>
   3784 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   3785                                                   StringRef Constraint,
   3786                                                   MVT VT) const {
   3787   if (Constraint.size() == 1) {
   3788     switch (Constraint[0]) {
   3789     case 'b':
   3790       return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
   3791     case 'c':
   3792       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
   3793     case 'h':
   3794       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
   3795     case 'r':
   3796       return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
   3797     case 'l':
   3798     case 'N':
   3799       return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
   3800     case 'f':
   3801       return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
   3802     case 'd':
   3803       return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
   3804     }
   3805   }
   3806   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   3807 }
   3808 
   3809 //===----------------------------------------------------------------------===//
   3810 //                         NVPTX DAG Combining
   3811 //===----------------------------------------------------------------------===//
   3812 
   3813 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
   3814                                    CodeGenOpt::Level OptLevel) const {
   3815   const Function *F = MF.getFunction();
   3816   const TargetOptions &TO = MF.getTarget().Options;
   3817 
   3818   // Always honor command-line argument
   3819   if (FMAContractLevelOpt.getNumOccurrences() > 0) {
   3820     return FMAContractLevelOpt > 0;
   3821   } else if (OptLevel == 0) {
   3822     // Do not contract if we're not optimizing the code
   3823     return false;
   3824   } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
   3825     // Honor TargetOptions flags that explicitly say fusion is okay
   3826     return true;
   3827   } else if (F->hasFnAttribute("unsafe-fp-math")) {
   3828     // Check for unsafe-fp-math=true coming from Clang
   3829     Attribute Attr = F->getFnAttribute("unsafe-fp-math");
   3830     StringRef Val = Attr.getValueAsString();
   3831     if (Val == "true")
   3832       return true;
   3833   }
   3834 
   3835   // We did not have a clear indication that fusion is allowed, so assume not
   3836   return false;
   3837 }
   3838 
   3839 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
   3840 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
   3841 /// called with the default operands, and if that fails, with commuted
   3842 /// operands.
   3843 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
   3844                                            TargetLowering::DAGCombinerInfo &DCI,
   3845                                              const NVPTXSubtarget &Subtarget,
   3846                                              CodeGenOpt::Level OptLevel) {
   3847   SelectionDAG &DAG = DCI.DAG;
   3848   // This combine handles scalar values only, so skip vector types.
   3849   EVT VT = N0.getValueType();
   3850   if (VT.isVector())
   3851     return SDValue();
   3852 
   3853   // fold (add (mul a, b), c) -> (mad a, b, c)
   3854   //
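          // For example (illustrative): on i32, (add (mul %a, %b), %c) becomes
          // NVPTXISD::IMAD, which is selected to the PTX instruction
          // "mad.lo.s32 %r, %a, %b, %c".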
   3855   if (N0.getOpcode() == ISD::MUL) {
   3856     assert(VT.isInteger());
   3857     // For integer:
   3858     // Since integer multiply-add costs the same as integer multiply
   3859     // but is more costly than integer add, do the fusion only when
   3860     // the mul is only used in the add.
   3861     if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
   3862         !N0.getNode()->hasOneUse())
   3863       return SDValue();
   3864 
   3865     // Do the folding
   3866     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
   3867                        N0.getOperand(0), N0.getOperand(1), N1);
   3868   } else if (N0.getOpcode() == ISD::FMUL) {
   3870     if (VT == MVT::f32 || VT == MVT::f64) {
   3871       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
   3872           &DAG.getTargetLoweringInfo());
   3873       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
   3874         return SDValue();
   3875 
   3876       // For floating point:
   3877       // Do the fusion only when the mul has fewer than 5 uses, all of
   3878       // which are adds.
   3879       // The heuristic is that if a use is not an add, that use cannot
   3880       // be fused into an fma, so the mul is still needed anyway. If
   3881       // there are more than 4 uses, even if they are all adds, fusing
   3882       // them will increase register pressure.
   3883       //
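              // For example (illustrative): if t = fmul a, b feeds two fadd
              // users, each fadd can become fma(a, b, addend) and t can die;
              // if t also feeds an fdiv, t must stay live anyway, so fusing
              // is less attractive.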
   3884       int numUses = 0;
   3885       int nonAddCount = 0;
   3886       for (const SDNode *User : N0.getNode()->uses()) {
   3887         numUses++;
   3888         if (User->getOpcode() != ISD::FADD)
   3889           ++nonAddCount;
   3890       }
   3894       if (numUses >= 5)
   3895         return SDValue();
   3896       if (nonAddCount) {
   3897         int orderNo = N->getIROrder();
   3898         int orderNo2 = N0.getNode()->getIROrder();
   3899         // Simple heuristic for potential register pressure: the IR
   3900         // order difference approximates the distance between the def
   3901         // and this use, and a longer distance is more likely to cause
   3902         // register pressure.
   3903         if (orderNo - orderNo2 < 500)
   3904           return SDValue();
   3905 
   3906         // Now, check if at least one of the FMUL's operands is live
   3907         // beyond node N, which guarantees that the FMA will not
   3908         // increase register pressure at node N.
   3908         bool opIsLive = false;
   3909         const SDNode *left = N0.getOperand(0).getNode();
   3910         const SDNode *right = N0.getOperand(1).getNode();
   3911 
   3912         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
   3913           opIsLive = true;
   3914 
   3915         if (!opIsLive)
   3916           for (const SDNode *User : left->uses()) {
   3917             if (User->getIROrder() > orderNo) {
   3918               opIsLive = true;
   3919               break;
   3920             }
   3921           }
   3924 
   3925         if (!opIsLive)
   3926           for (const SDNode *User : right->uses()) {
   3927             if (User->getIROrder() > orderNo) {
   3928               opIsLive = true;
   3929               break;
   3930             }
   3931           }
   3934 
   3935         if (!opIsLive)
   3936           return SDValue();
   3937       }
   3938 
   3939       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
   3940                          N0.getOperand(0), N0.getOperand(1), N1);
   3941     }
   3942   }
   3943 
   3944   return SDValue();
   3945 }
   3946 
   3947 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
   3948 ///
   3949 static SDValue PerformADDCombine(SDNode *N,
   3950                                  TargetLowering::DAGCombinerInfo &DCI,
   3951                                  const NVPTXSubtarget &Subtarget,
   3952                                  CodeGenOpt::Level OptLevel) {
   3953   SDValue N0 = N->getOperand(0);
   3954   SDValue N1 = N->getOperand(1);
   3955 
   3956   // First try with the default operand order.
   3957   if (SDValue Result =
   3958           PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
   3959     return Result;
   3960 
   3961   // If that didn't work, try again with the operands commuted.
   3962   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
   3963 }
   3964 
   3965 static SDValue PerformANDCombine(SDNode *N,
   3966                                  TargetLowering::DAGCombinerInfo &DCI) {
   3967   // The type legalizer turns a vector load of i8 values into a zextload to i16
   3968   // registers, optionally ANY_EXTENDs it (if target type is integer),
   3969   // and ANDs off the high 8 bits. Since we turn this load into a
   3970   // target-specific DAG node, the DAG combiner fails to eliminate these AND
   3971   // nodes. Do that here.
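          // For example (illustrative): (and (any_extend (i16 result of a
          // v4i8 LoadV4)), 0xff) adds no information, because the load
          // already zero-extended each i8 element, so the AND is removed.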
   3972   SDValue Val = N->getOperand(0);
   3973   SDValue Mask = N->getOperand(1);
   3974 
   3975   if (isa<ConstantSDNode>(Val)) {
   3976     std::swap(Val, Mask);
   3977   }
   3978 
   3979   SDValue AExt;
   3980   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
   3981   if (Val.getOpcode() == ISD::ANY_EXTEND) {
   3982     AExt = Val;
   3983     Val = Val->getOperand(0);
   3984   }
   3985 
   3986   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
   3987     Val = Val->getOperand(0);
   3988   }
   3989 
   3990   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
   3991       Val->getOpcode() == NVPTXISD::LoadV4) {
   3992     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
   3993     if (!MaskCnst) {
   3994       // Not an AND with a constant
   3995       return SDValue();
   3996     }
   3997 
   3998     uint64_t MaskVal = MaskCnst->getZExtValue();
   3999     if (MaskVal != 0xff) {
   4000       // Not an AND that chops off top 8 bits
   4001       return SDValue();
   4002     }
   4003 
   4004     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
   4005     if (!Mem) {
   4006       // Not a MemSDNode?!?
   4007       return SDValue();
   4008     }
   4009 
   4010     EVT MemVT = Mem->getMemoryVT();
   4011     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
   4012       // We only handle the i8 case
   4013       return SDValue();
   4014     }
   4015 
   4016     unsigned ExtType =
   4017       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
   4018         getZExtValue();
   4019     if (ExtType == ISD::SEXTLOAD) {
   4020       // If for some reason the load is a sextload, the and is needed to zero
   4021       // out the high 8 bits
   4022       return SDValue();
   4023     }
   4024 
   4025     bool AddTo = false;
   4026     if (AExt.getNode() != nullptr) {
   4027       // Re-insert the ext as a zext.
   4028       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
   4029                             AExt.getValueType(), Val);
   4030       AddTo = true;
   4031     }
   4032 
   4033     // If we get here, the AND is unnecessary.  Just replace it with the load
   4034     DCI.CombineTo(N, Val, AddTo);
   4035   }
   4036 
   4037   return SDValue();
   4038 }
   4039 
   4040 static SDValue PerformSELECTCombine(SDNode *N,
   4041                                     TargetLowering::DAGCombinerInfo &DCI) {
   4042   // Currently this detects patterns for integer min and max and
   4043   // lowers them to PTX-specific intrinsics that enable hardware
   4044   // support.
   4045 
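          // For example (illustrative): (select (setcc %a, %b, setgt), %a, %b)
          // on i32 is a signed max; it is rewritten as a call to
          // llvm.nvvm.max.i, which selects to the PTX "max.s32" instruction.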
   4046   const SDValue Cond = N->getOperand(0);
   4047   if (Cond.getOpcode() != ISD::SETCC) return SDValue();
   4048 
   4049   const SDValue LHS = Cond.getOperand(0);
   4050   const SDValue RHS = Cond.getOperand(1);
   4051   const SDValue True = N->getOperand(1);
   4052   const SDValue False = N->getOperand(2);
   4053   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
   4054     return SDValue();
   4055 
   4056   const EVT VT = N->getValueType(0);
   4057   if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
   4058 
   4059   const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   4060   SDValue Larger;  // The larger of LHS and RHS when condition is true.
   4061   switch (CC) {
   4062     case ISD::SETULT:
   4063     case ISD::SETULE:
   4064     case ISD::SETLT:
   4065     case ISD::SETLE:
   4066       Larger = RHS;
   4067       break;
   4068 
   4069     case ISD::SETGT:
   4070     case ISD::SETGE:
   4071     case ISD::SETUGT:
   4072     case ISD::SETUGE:
   4073       Larger = LHS;
   4074       break;
   4075 
   4076     default:
   4077       return SDValue();
   4078   }
   4079   const bool IsMax = (Larger == True);
   4080   const bool IsSigned = ISD::isSignedIntSetCC(CC);
   4081 
   4082   unsigned IntrinsicId;
   4083   if (VT == MVT::i32) {
   4084     if (IsSigned)
   4085       IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
   4086     else
   4087       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
   4088   } else {
   4089     assert(VT == MVT::i64);
   4090     if (IsSigned)
   4091       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
   4092     else
   4093       IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
   4094   }
   4095 
   4096   SDLoc DL(N);
   4097   return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
   4098                          DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
   4099 }
   4100 
   4101 enum OperandSignedness {
   4102   Signed = 0,
   4103   Unsigned,
   4104   Unknown
   4105 };
   4106 
   4107 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
   4108 /// that can be demoted to \p OptSize bits without loss of information. The
   4109 /// signedness of the operand, if determinable, is placed in \p S.
   4110 static bool IsMulWideOperandDemotable(SDValue Op,
   4111                                       unsigned OptSize,
   4112                                       OperandSignedness &S) {
   4113   S = Unknown;
   4114 
   4115   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
   4116       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
   4117     EVT OrigVT = Op.getOperand(0).getValueType();
   4118     if (OrigVT.getSizeInBits() <= OptSize) {
   4119       S = Signed;
   4120       return true;
   4121     }
   4122   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
   4123     EVT OrigVT = Op.getOperand(0).getValueType();
   4124     if (OrigVT.getSizeInBits() <= OptSize) {
   4125       S = Unsigned;
   4126       return true;
   4127     }
   4128   }
   4129 
   4130   return false;
   4131 }
   4132 
   4133 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
   4134 /// be demoted to \p OptSize bits without loss of information. If the operands
   4135 /// contain a constant, it should appear as the RHS operand. The signedness of
   4136 /// the operands is placed in \p IsSigned.
   4137 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
   4138                                         unsigned OptSize,
   4139                                         bool &IsSigned) {
   4140 
   4141   OperandSignedness LHSSign;
   4142 
   4143   // The LHS operand must be a demotable op
   4144   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
   4145     return false;
   4146 
   4147   // We should have been able to determine the signedness from the LHS
   4148   if (LHSSign == Unknown)
   4149     return false;
   4150 
   4151   IsSigned = (LHSSign == Signed);
   4152 
   4153   // The RHS can be a demotable op or a constant
   4154   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
   4155     const APInt &Val = CI->getAPIntValue();
   4156     if (LHSSign == Unsigned) {
   4157       return Val.isIntN(OptSize);
   4158     } else {
   4159       return Val.isSignedIntN(OptSize);
   4160     }
   4161   } else {
   4162     OperandSignedness RHSSign;
   4163     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
   4164       return false;
   4165 
   4166     return LHSSign == RHSSign;
   4167   }
   4168 }
   4169 
   4170 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
   4171 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
   4172 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
   4173 /// amount.
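        /// For example (illustrative): (mul i32 (sext i16 %a), (sext i16 %b))
        /// becomes NVPTXISD::MUL_WIDE_SIGNED, selected to PTX "mul.wide.s16";
        /// (shl i32 (zext i16 %a), 3) is treated as a multiply by 8 and is
        /// handled the same way.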
   4174 static SDValue TryMULWIDECombine(SDNode *N,
   4175                                  TargetLowering::DAGCombinerInfo &DCI) {
   4176   EVT MulType = N->getValueType(0);
   4177   if (MulType != MVT::i32 && MulType != MVT::i64) {
   4178     return SDValue();
   4179   }
   4180 
   4181   SDLoc DL(N);
   4182   unsigned OptSize = MulType.getSizeInBits() >> 1;
   4183   SDValue LHS = N->getOperand(0);
   4184   SDValue RHS = N->getOperand(1);
   4185 
   4186   // Canonicalize the multiply so the constant (if any) is on the right
   4187   if (N->getOpcode() == ISD::MUL) {
   4188     if (isa<ConstantSDNode>(LHS)) {
   4189       std::swap(LHS, RHS);
   4190     }
   4191   }
   4192 
   4193   // If we have a SHL, determine the actual multiply amount
   4194   if (N->getOpcode() == ISD::SHL) {
   4195     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
   4196     if (!ShlRHS) {
   4197       return SDValue();
   4198     }
   4199 
   4200     APInt ShiftAmt = ShlRHS->getAPIntValue();
   4201     unsigned BitWidth = MulType.getSizeInBits();
   4202     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
   4203       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
   4204       RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
   4205     } else {
   4206       return SDValue();
   4207     }
   4208   }
   4209 
   4210   bool Signed;
   4211   // Verify that our operands are demotable
   4212   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
   4213     return SDValue();
   4214   }
   4215 
   4216   EVT DemotedVT;
   4217   if (MulType == MVT::i32) {
   4218     DemotedVT = MVT::i16;
   4219   } else {
   4220     DemotedVT = MVT::i32;
   4221   }
   4222 
   4223   // Truncate the operands to the correct size. Note that these are just for
   4224   // type consistency and will (likely) be eliminated in later phases.
   4225   SDValue TruncLHS =
   4226     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
   4227   SDValue TruncRHS =
   4228     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
   4229 
   4230   unsigned Opc;
   4231   if (Signed) {
   4232     Opc = NVPTXISD::MUL_WIDE_SIGNED;
   4233   } else {
   4234     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
   4235   }
   4236 
   4237   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
   4238 }
   4239 
   4240 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
   4241 static SDValue PerformMULCombine(SDNode *N,
   4242                                  TargetLowering::DAGCombinerInfo &DCI,
   4243                                  CodeGenOpt::Level OptLevel) {
   4244   if (OptLevel > 0) {
   4245     // Try mul.wide combining at OptLevel > 0
   4246     if (SDValue Ret = TryMULWIDECombine(N, DCI))
   4247       return Ret;
   4248   }
   4249 
   4250   return SDValue();
   4251 }
   4252 
   4253 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
   4254 static SDValue PerformSHLCombine(SDNode *N,
   4255                                  TargetLowering::DAGCombinerInfo &DCI,
   4256                                  CodeGenOpt::Level OptLevel) {
   4257   if (OptLevel > 0) {
   4258     // Try mul.wide combining at OptLevel > 0
   4259     if (SDValue Ret = TryMULWIDECombine(N, DCI))
   4260       return Ret;
   4261   }
   4262 
   4263   return SDValue();
   4264 }
   4265 
   4266 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
   4267                                                DAGCombinerInfo &DCI) const {
   4268   CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
   4269   switch (N->getOpcode()) {
   4270     default: break;
   4271     case ISD::ADD:
   4272     case ISD::FADD:
   4273       return PerformADDCombine(N, DCI, STI, OptLevel);
   4274     case ISD::MUL:
   4275       return PerformMULCombine(N, DCI, OptLevel);
   4276     case ISD::SHL:
   4277       return PerformSHLCombine(N, DCI, OptLevel);
   4278     case ISD::AND:
   4279       return PerformANDCombine(N, DCI);
   4280     case ISD::SELECT:
   4281       return PerformSELECTCombine(N, DCI);
   4282   }
   4283   return SDValue();
   4284 }
   4285 
   4286 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
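        /// For example (illustrative): a load of <2 x i32> is replaced by an
        /// NVPTXISD::LoadV2 node producing two i32 values plus a chain, later
        /// selected to a PTX vector load such as "ld.v2.u32 {%r0, %r1}, [%rd]".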
   4287 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   4288                               SmallVectorImpl<SDValue> &Results) {
   4289   EVT ResVT = N->getValueType(0);
   4290   SDLoc DL(N);
   4291 
   4292   assert(ResVT.isVector() && "Vector load must have vector type");
   4293 
   4294   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
   4295   // legal.  We can (and should) split that into 2 loads of <2 x double> here
   4296   // but I'm leaving that as a TODO for now.
   4297   assert(ResVT.isSimple() && "Can only handle simple types");
   4298   switch (ResVT.getSimpleVT().SimpleTy) {
   4299   default:
   4300     return;
   4301   case MVT::v2i8:
   4302   case MVT::v2i16:
   4303   case MVT::v2i32:
   4304   case MVT::v2i64:
   4305   case MVT::v2f32:
   4306   case MVT::v2f64:
   4307   case MVT::v4i8:
   4308   case MVT::v4i16:
   4309   case MVT::v4i32:
   4310   case MVT::v4f32:
   4311     // This is a "native" vector type
   4312     break;
   4313   }
   4314 
   4315   LoadSDNode *LD = cast<LoadSDNode>(N);
   4316 
   4317   unsigned Align = LD->getAlignment();
   4318   auto &TD = DAG.getDataLayout();
   4319   unsigned PrefAlign =
   4320       TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
   4321   if (Align < PrefAlign) {
   4322     // This load is not sufficiently aligned, so bail out and let this vector
   4323     // load be scalarized.  Note that we may still be able to emit smaller
   4324     // vector loads.  For example, if we are loading a <4 x float> with an
   4325     // alignment of 8, this check will fail but the legalizer will try again
   4326     // with 2 x <2 x float>, which will succeed with an alignment of 8.
   4327     return;
   4328   }
   4329 
   4330   EVT EltVT = ResVT.getVectorElementType();
   4331   unsigned NumElts = ResVT.getVectorNumElements();
   4332 
   4333   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
   4334   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
   4335   // loaded type to i16 and propagate the "real" type as the memory type.
   4336   bool NeedTrunc = false;
   4337   if (EltVT.getSizeInBits() < 16) {
   4338     EltVT = MVT::i16;
   4339     NeedTrunc = true;
   4340   }
   4341 
   4342   unsigned Opcode = 0;
   4343   SDVTList LdResVTs;
   4344 
   4345   switch (NumElts) {
   4346   default:
   4347     return;
   4348   case 2:
   4349     Opcode = NVPTXISD::LoadV2;
   4350     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
   4351     break;
   4352   case 4: {
   4353     Opcode = NVPTXISD::LoadV4;
   4354     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
   4355     LdResVTs = DAG.getVTList(ListVTs);
   4356     break;
   4357   }
   4358   }
   4359 
   4360   // Copy regular operands
   4361   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
   4362 
   4363   // The select routine does not have access to the LoadSDNode instance, so
   4364   // pass along the extension information
   4365   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
   4366 
   4367   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
   4368                                           LD->getMemoryVT(),
   4369                                           LD->getMemOperand());
   4370 
   4371   SmallVector<SDValue, 4> ScalarRes;
   4372 
   4373   for (unsigned i = 0; i < NumElts; ++i) {
   4374     SDValue Res = NewLD.getValue(i);
   4375     if (NeedTrunc)
   4376       Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
   4377     ScalarRes.push_back(Res);
   4378   }
   4379 
   4380   SDValue LoadChain = NewLD.getValue(NumElts);
   4381 
   4382   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
   4383 
   4384   Results.push_back(BuildVec);
   4385   Results.push_back(LoadChain);
   4386 }
   4387 
   4388 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
   4389                                      SmallVectorImpl<SDValue> &Results) {
   4390   SDValue Chain = N->getOperand(0);
   4391   SDValue Intrin = N->getOperand(1);
   4392   SDLoc DL(N);
   4393 
   4394   // Get the intrinsic ID
   4395   unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
   4396   switch (IntrinNo) {
   4397   default:
   4398     return;
   4399   case Intrinsic::nvvm_ldg_global_i:
   4400   case Intrinsic::nvvm_ldg_global_f:
   4401   case Intrinsic::nvvm_ldg_global_p:
   4402   case Intrinsic::nvvm_ldu_global_i:
   4403   case Intrinsic::nvvm_ldu_global_f:
   4404   case Intrinsic::nvvm_ldu_global_p: {
   4405     EVT ResVT = N->getValueType(0);
   4406 
   4407     if (ResVT.isVector()) {
   4408       // Vector LDG/LDU
   4409 
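              // For example (illustrative): an ldg of <4 x float> is re-emitted
              // as NVPTXISD::LDGV4 with four f32 results plus a chain, later
              // selected to the PTX instruction "ld.global.nc.v4.f32".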
   4410       unsigned NumElts = ResVT.getVectorNumElements();
   4411       EVT EltVT = ResVT.getVectorElementType();
   4412 
   4413       // Since LDU/LDG are target nodes, we cannot rely on DAG type
   4414       // legalization.
   4415       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
   4416       // loaded type to i16 and propagate the "real" type as the memory type.
   4417       bool NeedTrunc = false;
   4418       if (EltVT.getSizeInBits() < 16) {
   4419         EltVT = MVT::i16;
   4420         NeedTrunc = true;
   4421       }
   4422 
   4423       unsigned Opcode = 0;
   4424       SDVTList LdResVTs;
   4425 
   4426       switch (NumElts) {
   4427       default:
   4428         return;
   4429       case 2:
   4430         switch (IntrinNo) {
   4431         default:
   4432           return;
   4433         case Intrinsic::nvvm_ldg_global_i:
   4434         case Intrinsic::nvvm_ldg_global_f:
   4435         case Intrinsic::nvvm_ldg_global_p:
   4436           Opcode = NVPTXISD::LDGV2;
   4437           break;
   4438         case Intrinsic::nvvm_ldu_global_i:
   4439         case Intrinsic::nvvm_ldu_global_f:
   4440         case Intrinsic::nvvm_ldu_global_p:
   4441           Opcode = NVPTXISD::LDUV2;
   4442           break;
   4443         }
   4444         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
   4445         break;
   4446       case 4: {
   4447         switch (IntrinNo) {
   4448         default:
   4449           return;
   4450         case Intrinsic::nvvm_ldg_global_i:
   4451         case Intrinsic::nvvm_ldg_global_f:
   4452         case Intrinsic::nvvm_ldg_global_p:
   4453           Opcode = NVPTXISD::LDGV4;
   4454           break;
   4455         case Intrinsic::nvvm_ldu_global_i:
   4456         case Intrinsic::nvvm_ldu_global_f:
   4457         case Intrinsic::nvvm_ldu_global_p:
   4458           Opcode = NVPTXISD::LDUV4;
   4459           break;
   4460         }
   4461         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
   4462         LdResVTs = DAG.getVTList(ListVTs);
   4463         break;
   4464       }
   4465       }
   4466 
   4467       SmallVector<SDValue, 8> OtherOps;
   4468 
   4469       // Copy the regular operands: the chain, then every operand after
   4470       // the intrinsic ID (operand 1), which is dropped.
   4471       OtherOps.push_back(Chain);
   4472       OtherOps.append(N->op_begin() + 2, N->op_end());
   4475 
   4476       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
   4477 
   4478       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
   4479                                               MemSD->getMemoryVT(),
   4480                                               MemSD->getMemOperand());
   4481 
   4482       SmallVector<SDValue, 4> ScalarRes;
   4483 
   4484       for (unsigned i = 0; i < NumElts; ++i) {
   4485         SDValue Res = NewLD.getValue(i);
   4486         if (NeedTrunc)
   4487           Res =
   4488               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
   4489         ScalarRes.push_back(Res);
   4490       }
   4491 
   4492       SDValue LoadChain = NewLD.getValue(NumElts);
   4493 
   4494       SDValue BuildVec =
   4495           DAG.getBuildVector(ResVT, DL, ScalarRes);
   4496 
   4497       Results.push_back(BuildVec);
   4498       Results.push_back(LoadChain);
   4499     } else {
   4500       // i8 LDG/LDU
   4501       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
   4502              "Custom handling of non-i8 ldu/ldg?");
   4503 
   4504       // Just copy all operands as-is
   4505       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
   4506 
   4507       // Force output to i16
   4508       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
   4509 
   4510       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
   4511 
   4512       // We make sure the memory type is i8, which will be used during isel
   4513       // to select the proper instruction.
   4514       SDValue NewLD =
   4515           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
   4516                                   MVT::i8, MemSD->getMemOperand());
   4517 
   4518       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
   4519                                     NewLD.getValue(0)));
   4520       Results.push_back(NewLD.getValue(1));
   4521     }
   4522   }
   4523   }
   4524 }
   4525 
   4526 void NVPTXTargetLowering::ReplaceNodeResults(
   4527     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   4528   switch (N->getOpcode()) {
   4529   default:
   4530     report_fatal_error("Unhandled custom legalization");
   4531   case ISD::LOAD:
   4532     ReplaceLoadVector(N, DAG, Results);
   4533     return;
   4534   case ISD::INTRINSIC_W_CHAIN:
   4535     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
   4536     return;
   4537   }
   4538 }
   4539 
   4540 // Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
   4541 void NVPTXSection::anchor() {}
   4542 
   4543 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
   4544   delete static_cast<NVPTXSection *>(TextSection);
   4545   delete static_cast<NVPTXSection *>(DataSection);
   4546   delete static_cast<NVPTXSection *>(BSSSection);
   4547   delete static_cast<NVPTXSection *>(ReadOnlySection);
   4548 
   4549   delete static_cast<NVPTXSection *>(StaticCtorSection);
   4550   delete static_cast<NVPTXSection *>(StaticDtorSection);
   4551   delete static_cast<NVPTXSection *>(LSDASection);
   4552   delete static_cast<NVPTXSection *>(EHFrameSection);
   4553   delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
   4554   delete static_cast<NVPTXSection *>(DwarfInfoSection);
   4555   delete static_cast<NVPTXSection *>(DwarfLineSection);
   4556   delete static_cast<NVPTXSection *>(DwarfFrameSection);
   4557   delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
   4558   delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
   4559   delete static_cast<NVPTXSection *>(DwarfStrSection);
   4560   delete static_cast<NVPTXSection *>(DwarfLocSection);
   4561   delete static_cast<NVPTXSection *>(DwarfARangesSection);
   4562   delete static_cast<NVPTXSection *>(DwarfRangesSection);
   4563   delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
   4564 }
   4565 
   4566 MCSection *
   4567 NVPTXTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
   4568                                               SectionKind Kind, Mangler &Mang,
   4569                                               const TargetMachine &TM) const {
   4570   return getDataSection();
   4571 }
   4572