Home | History | Annotate | Download | only in NVPTX
      1 //===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file describes the PTX instructions in TableGen format.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 include "NVPTXInstrFormats.td"
     15 
     16 // A NOP instruction
        // It takes no operands, has an empty assembly string and no selection
        // pattern, and is marked as having no side effects.
     17 let hasSideEffects = 0 in {
     18   def NOP : NVPTXInst<(outs), (ins), "", []>;
     19 }
     20 
     21 let OperandType = "OPERAND_IMMEDIATE" in {
          // Immediate operand of value type f16.
     22   def f16imm : Operand<f16>;
     23 }
     24 
     25 // List of vector specific properties
        // Each def below is a VecInstTypeEnum instance with a distinct value
        // (1 through 7, plus 15 for isVecOther).  VecInstTypeEnum itself is not
        // defined in the visible part of this file.
     26 def isVecLD      : VecInstTypeEnum<1>;
     27 def isVecST      : VecInstTypeEnum<2>;
     28 def isVecBuild   : VecInstTypeEnum<3>;
     29 def isVecShuffle : VecInstTypeEnum<4>;
     30 def isVecExtract : VecInstTypeEnum<5>;
     31 def isVecInsert  : VecInstTypeEnum<6>;
     32 def isVecDest    : VecInstTypeEnum<7>;
     33 def isVecOther   : VecInstTypeEnum<15>;
     34 
     35 //===----------------------------------------------------------------------===//
     36 // NVPTX Operand Definitions.
     37 //===----------------------------------------------------------------------===//
     38 
     39 def brtarget    : Operand<OtherVT>;  // Operand of type OtherVT; by its name, presumably a branch destination.
     40 
     41 // CVT conversion modes
     42 // These must match the enum in NVPTX.h
        // Each leaf matches one specific i32 constant.  As laid out below, the
        // values 0x0-0x8 select the base mode, the _FTZ form of a mode is the base
        // value plus 0x10, CvtSAT is 0x20 and CvtSAT_FTZ is 0x30 (0x20 plus 0x10).
     43 def CvtNONE : PatLeaf<(i32 0x0)>;
     44 def CvtRNI  : PatLeaf<(i32 0x1)>;
     45 def CvtRZI  : PatLeaf<(i32 0x2)>;
     46 def CvtRMI  : PatLeaf<(i32 0x3)>;
     47 def CvtRPI  : PatLeaf<(i32 0x4)>;
     48 def CvtRN   : PatLeaf<(i32 0x5)>;
     49 def CvtRZ   : PatLeaf<(i32 0x6)>;
     50 def CvtRM   : PatLeaf<(i32 0x7)>;
     51 def CvtRP   : PatLeaf<(i32 0x8)>;
     52 
     53 def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
     54 def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
     55 def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
     56 def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
     57 def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
     58 def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
     59 def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
     60 def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
     61 def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;
     62 
     63 def CvtSAT      : PatLeaf<(i32 0x20)>;
     64 def CvtSAT_FTZ  : PatLeaf<(i32 0x30)>;
     65 
        // i32 operand holding one of the mode values above; it is printed through
        // the "printCvtMode" print method.
     66 def CvtMode : Operand<i32> {
     67   let PrintMethod = "printCvtMode";
     68 }
     69 
     70 // Compare modes
     71 // These must match the enum in NVPTX.h
        // Each leaf matches one specific i32 constant.  The base modes use the
        // values 0-5 and 10-17; the _FTZ form of a mode is the base value plus
        // 0x100 (e.g. CmpEQU = 10 = 0xA, CmpEQU_FTZ = 0x10A).
     72 def CmpEQ   : PatLeaf<(i32 0)>;
     73 def CmpNE   : PatLeaf<(i32 1)>;
     74 def CmpLT   : PatLeaf<(i32 2)>;
     75 def CmpLE   : PatLeaf<(i32 3)>;
     76 def CmpGT   : PatLeaf<(i32 4)>;
     77 def CmpGE   : PatLeaf<(i32 5)>;
     78 def CmpEQU  : PatLeaf<(i32 10)>;
     79 def CmpNEU  : PatLeaf<(i32 11)>;
     80 def CmpLTU  : PatLeaf<(i32 12)>;
     81 def CmpLEU  : PatLeaf<(i32 13)>;
     82 def CmpGTU  : PatLeaf<(i32 14)>;
     83 def CmpGEU  : PatLeaf<(i32 15)>;
     84 def CmpNUM  : PatLeaf<(i32 16)>;
     85 def CmpNAN  : PatLeaf<(i32 17)>;
     86 
     87 def CmpEQ_FTZ   : PatLeaf<(i32 0x100)>;
     88 def CmpNE_FTZ   : PatLeaf<(i32 0x101)>;
     89 def CmpLT_FTZ   : PatLeaf<(i32 0x102)>;
     90 def CmpLE_FTZ   : PatLeaf<(i32 0x103)>;
     91 def CmpGT_FTZ   : PatLeaf<(i32 0x104)>;
     92 def CmpGE_FTZ   : PatLeaf<(i32 0x105)>;
     93 def CmpEQU_FTZ  : PatLeaf<(i32 0x10A)>;
     94 def CmpNEU_FTZ  : PatLeaf<(i32 0x10B)>;
     95 def CmpLTU_FTZ  : PatLeaf<(i32 0x10C)>;
     96 def CmpLEU_FTZ  : PatLeaf<(i32 0x10D)>;
     97 def CmpGTU_FTZ  : PatLeaf<(i32 0x10E)>;
     98 def CmpGEU_FTZ  : PatLeaf<(i32 0x10F)>;
     99 def CmpNUM_FTZ  : PatLeaf<(i32 0x110)>;
    100 def CmpNAN_FTZ  : PatLeaf<(i32 0x111)>;
    101 
        // i32 operand holding one of the compare-mode values above; it is printed
        // through the "printCmpMode" print method.
    102 def CmpMode : Operand<i32> {
    103   let PrintMethod = "printCmpMode";
    104 }
    105 def VecElement : Operand<i32> {
          // i32 operand printed through the "printVecElement" print method.
    106   let PrintMethod = "printVecElement";
    107 }
    108 
    109 //===----------------------------------------------------------------------===//
    110 // NVPTX Instruction Predicate Definitions
    111 //===----------------------------------------------------------------------===//
    112 
    113 
    114 def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;  // Subtarget feature queries (this def through hasLDU).
    115 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
    116 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
    117 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
    118 def hasVote : Predicate<"Subtarget->hasVote()">;
    119 def hasDouble : Predicate<"Subtarget->hasDouble()">;
    120 def hasLDG : Predicate<"Subtarget->hasLDG()">;
    121 def hasLDU : Predicate<"Subtarget->hasLDU()">;
    122 
        // f32 flush-to-zero selection; the two predicates are exact complements.
    123 def doF32FTZ : Predicate<"useF32FTZ()">;
    124 def doNoF32FTZ : Predicate<"!useF32FTZ()">;
    125 
        // NOTE: unlike the neighbouring predicates, the condition string here is a
        // bare identifier (doMulWide), not a function call.
    126 def doMulWide      : Predicate<"doMulWide">;
    127 
        // allowFMA and noFMA are exact complements of each other.
    128 def allowFMA : Predicate<"allowFMA()">;
    129 def noFMA : Predicate<"!allowFMA()">;
    130 def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
    131 
        // Only division levels 0 and 1 are distinguished here; any other level
        // satisfies neither predicate.
    132 def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
    133 def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
    134 
        // Exact complements, both keyed on usePrecSqrtF32().
    135 def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
    136 def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
    137 
        // Exact complements, both keyed on Subtarget->hasHWROT32().
    138 def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
    139 def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
    140 
        // Predicate whose condition is the constant "true".
    141 def true : Predicate<"true">;
    142 
        // Minimum PTX version checks (the comparison is >=, not ==).
    143 def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
    144 def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
    145 def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
    146 
        // Minimum SM version checks (the comparison is >=, not ==).
    147 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
    148 def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
    149 
    150 def useShortPtr : Predicate<"useShortPointers()">;
    151 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
    152 
    153 //===----------------------------------------------------------------------===//
    154 // Some Common Instruction Class Templates
    155 //===----------------------------------------------------------------------===//
    156 
    157 // Template for instructions which take three int64, int32, or int16 args.
    158 // The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
        //
        // For each width this defines an "rr" form (both sources in registers) and
        // an "ri" form (second source is an immediate of that width).  Every def
        // carries a selection pattern that matches OpNode on the same operands.
    159 multiclass I3<string OpcStr, SDNode OpNode> {
    160   def i64rr :
    161     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
    162               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
    163               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
    164   def i64ri :
    165     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
    166               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
    167               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
    168   def i32rr :
    169     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
    170               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
    171               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    172   def i32ri :
    173     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
    174               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
    175               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
    176   def i16rr :
    177     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
    178               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
    179               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
          // Note: this pattern spells the immediate as "(imm):$b" where the 64- and
          // 32-bit forms above write "imm:$b".
    180   def i16ri :
    181     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
    182               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
    183               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
    184 }
    185 
    186 // Template for instructions which take 3 int32 args.  The instructions are
    187 // named "<OpcStr>.s32" (e.g. "addc.cc.s32").
        //
        // Defines a register-register (i32rr) and a register-immediate (i32ri)
        // form, each with a selection pattern that matches OpNode.
    188 multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
    189    def i32rr :
    190      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
    191                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
    192                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    193    def i32ri :
    194      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
    195                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
    196                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
    197 }
    198 
    199 // Template for instructions which take three fp64 or fp32 args.  The
    200 // instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
    201 //
    202 // Also defines ftz (flush subnormal inputs and results to sign-preserving
    203 // zero) variants for fp32 functions.
    204 //
    205 // This multiclass should be used for nodes that cannot be folded into FMAs.
    206 // For nodes that can be folded into FMAs (i.e. adds and muls), use
    207 // F3_fma_component.
        //
        // The f32 "_ftz" defs are gated on doF32FTZ and are listed before the plain
        // f32 defs, which carry no predicate.  Every width has a register-register
        // (rr) and a register-immediate (ri) form.
    208 multiclass F3<string OpcStr, SDNode OpNode> {
    209    def f64rr :
    210      NVPTXInst<(outs Float64Regs:$dst),
    211                (ins Float64Regs:$a, Float64Regs:$b),
    212                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
    213                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
    214    def f64ri :
    215      NVPTXInst<(outs Float64Regs:$dst),
    216                (ins Float64Regs:$a, f64imm:$b),
    217                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
    218                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
    219    def f32rr_ftz :
    220      NVPTXInst<(outs Float32Regs:$dst),
    221                (ins Float32Regs:$a, Float32Regs:$b),
    222                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
    223                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    224                Requires<[doF32FTZ]>;
    225    def f32ri_ftz :
    226      NVPTXInst<(outs Float32Regs:$dst),
    227                (ins Float32Regs:$a, f32imm:$b),
    228                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
    229                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    230                Requires<[doF32FTZ]>;
    231    def f32rr :
    232      NVPTXInst<(outs Float32Regs:$dst),
    233                (ins Float32Regs:$a, Float32Regs:$b),
    234                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
    235                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
    236    def f32ri :
    237      NVPTXInst<(outs Float32Regs:$dst),
    238                (ins Float32Regs:$a, f32imm:$b),
    239                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
    240                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
    241 }
    242 
    243 // Template for instructions which take three FP args.  The
    244 // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
    245 //
    246 // Also defines ftz (flush subnormal inputs and results to sign-preserving
    247 // zero) variants for fp32/fp16 functions.
    248 //
    249 // This multiclass should be used for nodes that can be folded to make fma ops.
    250 // In this case, we use the ".rn" variant when FMA is disabled, as this behaves
    251 // just like the non ".rn" op, but prevents ptxas from creating FMAs.
        //
        // Layout of the defs below:
        //  * The first group (no ".rn") requires allowFMA; the "_rn" group requires
        //    noFMA.  The two groups are otherwise parallel.
        //  * f64 and f32 have rr and ri forms; f16 and f16x2 have rr forms only and
        //    additionally require useFP16Math.
        //  * All "_ftz" defs, including the f16/f16x2 ones, are gated on doF32FTZ.
    252 multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
    253    def f64rr :
    254      NVPTXInst<(outs Float64Regs:$dst),
    255                (ins Float64Regs:$a, Float64Regs:$b),
    256                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
    257                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    258                Requires<[allowFMA]>;
    259    def f64ri :
    260      NVPTXInst<(outs Float64Regs:$dst),
    261                (ins Float64Regs:$a, f64imm:$b),
    262                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
    263                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    264                Requires<[allowFMA]>;
    265    def f32rr_ftz :
    266      NVPTXInst<(outs Float32Regs:$dst),
    267                (ins Float32Regs:$a, Float32Regs:$b),
    268                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
    269                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    270                Requires<[allowFMA, doF32FTZ]>;
    271    def f32ri_ftz :
    272      NVPTXInst<(outs Float32Regs:$dst),
    273                (ins Float32Regs:$a, f32imm:$b),
    274                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
    275                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    276                Requires<[allowFMA, doF32FTZ]>;
    277    def f32rr :
    278      NVPTXInst<(outs Float32Regs:$dst),
    279                (ins Float32Regs:$a, Float32Regs:$b),
    280                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
    281                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    282                Requires<[allowFMA]>;
    283    def f32ri :
    284      NVPTXInst<(outs Float32Regs:$dst),
    285                (ins Float32Regs:$a, f32imm:$b),
    286                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
    287                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    288                Requires<[allowFMA]>;
    289 
    290    def f16rr_ftz :
    291      NVPTXInst<(outs Float16Regs:$dst),
    292                (ins Float16Regs:$a, Float16Regs:$b),
    293                !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
    294                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    295                Requires<[useFP16Math, allowFMA, doF32FTZ]>;
    296    def f16rr :
    297      NVPTXInst<(outs Float16Regs:$dst),
    298                (ins Float16Regs:$a, Float16Regs:$b),
    299                !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
    300                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    301                Requires<[useFP16Math, allowFMA]>;
    302 
    303    def f16x2rr_ftz :
    304      NVPTXInst<(outs Float16x2Regs:$dst),
    305                (ins Float16x2Regs:$a, Float16x2Regs:$b),
    306                !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
    307                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    308                Requires<[useFP16Math, allowFMA, doF32FTZ]>;
    309    def f16x2rr :
    310      NVPTXInst<(outs Float16x2Regs:$dst),
    311                (ins Float16x2Regs:$a, Float16x2Regs:$b),
    312                !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
    313                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    314                Requires<[useFP16Math, allowFMA]>;
    315 
    316    // These have strange names so we don't perturb existing mir tests.
    317    def _rnf64rr :
    318      NVPTXInst<(outs Float64Regs:$dst),
    319                (ins Float64Regs:$a, Float64Regs:$b),
    320                !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
    321                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
    322                Requires<[noFMA]>;
    323    def _rnf64ri :
    324      NVPTXInst<(outs Float64Regs:$dst),
    325                (ins Float64Regs:$a, f64imm:$b),
    326                !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
    327                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
    328                Requires<[noFMA]>;
    329    def _rnf32rr_ftz :
    330      NVPTXInst<(outs Float32Regs:$dst),
    331                (ins Float32Regs:$a, Float32Regs:$b),
    332                !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
    333                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    334                Requires<[noFMA, doF32FTZ]>;
    335    def _rnf32ri_ftz :
    336      NVPTXInst<(outs Float32Regs:$dst),
    337                (ins Float32Regs:$a, f32imm:$b),
    338                !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
    339                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    340                Requires<[noFMA, doF32FTZ]>;
    341    def _rnf32rr :
    342      NVPTXInst<(outs Float32Regs:$dst),
    343                (ins Float32Regs:$a, Float32Regs:$b),
    344                !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
    345                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
    346                Requires<[noFMA]>;
    347    def _rnf32ri :
    348      NVPTXInst<(outs Float32Regs:$dst),
    349                (ins Float32Regs:$a, f32imm:$b),
    350                !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
    351                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
    352                Requires<[noFMA]>;
    353    def _rnf16rr_ftz :
    354      NVPTXInst<(outs Float16Regs:$dst),
    355                (ins Float16Regs:$a, Float16Regs:$b),
    356                !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
    357                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    358                Requires<[useFP16Math, noFMA, doF32FTZ]>;
    359    def _rnf16rr :
    360      NVPTXInst<(outs Float16Regs:$dst),
    361                (ins Float16Regs:$a, Float16Regs:$b),
    362                !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
    363                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
    364                Requires<[useFP16Math, noFMA]>;
    365    def _rnf16x2rr_ftz :
    366      NVPTXInst<(outs Float16x2Regs:$dst),
    367                (ins Float16x2Regs:$a, Float16x2Regs:$b),
    368                !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
    369                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    370                Requires<[useFP16Math, noFMA, doF32FTZ]>;
    371    def _rnf16x2rr :
    372      NVPTXInst<(outs Float16x2Regs:$dst),
    373                (ins Float16x2Regs:$a, Float16x2Regs:$b),
    374                !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
    375                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
    376                Requires<[useFP16Math, noFMA]>;
    377 }
    378 
    379 // Template for operations which take two f32 or f64 operands.  Provides three
    380 // instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
    381 // subnormal inputs and results to zero).
        //
        // The two operands are the destination and a single source register.  The
        // f32_ftz def is gated on doF32FTZ and is listed before the unpredicated
        // f32 def.
    382 multiclass F2<string OpcStr, SDNode OpNode> {
    383    def f64 :     NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
    384                            !strconcat(OpcStr, ".f64 \t$dst, $a;"),
    385                            [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
    386    def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
    387                            !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
    388                            [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
    389                            Requires<[doF32FTZ]>;
    390    def f32 :     NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
    391                            !strconcat(OpcStr, ".f32 \t$dst, $a;"),
    392                            [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
    393 }
    394 
    395 //===----------------------------------------------------------------------===//
    396 // NVPTX Instructions.
    397 //===----------------------------------------------------------------------===//
    398 
    399 //-----------------------------------
    400 // Type Conversion
    401 //-----------------------------------
    402 
    403 let hasSideEffects = 0 in {
    404   // Generate a cvt to the given type from all possible types.  Each instance
    405   // takes a CvtMode immediate that defines the conversion mode to use.  It can
    406   // be CvtNONE to omit a conversion mode.
          //
          // NOTE: FromName is printed as the first type suffix of the cvt and RC is
          // the output register class, so despite its name FromName describes the
          // destination type; the def suffix (_s8, _u8, ...) names the source type.
          // 8-bit sources are read from Int16Regs.  The $mode operand is printed
          // three times with the "base", "ftz" and "sat" modifiers.  None of these
          // defs carries a selection pattern.
    407   multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
    408     def _s8 :
    409       NVPTXInst<(outs RC:$dst),
    410                 (ins Int16Regs:$src, CvtMode:$mode),
    411                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    412                 FromName, ".s8 \t$dst, $src;"), []>;
    413     def _u8 :
    414       NVPTXInst<(outs RC:$dst),
    415                 (ins Int16Regs:$src, CvtMode:$mode),
    416                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    417                 FromName, ".u8 \t$dst, $src;"), []>;
    418     def _s16 :
    419       NVPTXInst<(outs RC:$dst),
    420                 (ins Int16Regs:$src, CvtMode:$mode),
    421                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    422                 FromName, ".s16 \t$dst, $src;"), []>;
    423     def _u16 :
    424       NVPTXInst<(outs RC:$dst),
    425                 (ins Int16Regs:$src, CvtMode:$mode),
    426                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    427                 FromName, ".u16 \t$dst, $src;"), []>;
    428     def _s32 :
    429       NVPTXInst<(outs RC:$dst),
    430                 (ins Int32Regs:$src, CvtMode:$mode),
    431                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    432                 FromName, ".s32 \t$dst, $src;"), []>;
    433     def _u32 :
    434       NVPTXInst<(outs RC:$dst),
    435                 (ins Int32Regs:$src, CvtMode:$mode),
    436                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    437                 FromName, ".u32 \t$dst, $src;"), []>;
    438     def _s64 :
    439       NVPTXInst<(outs RC:$dst),
    440                 (ins Int64Regs:$src, CvtMode:$mode),
    441                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    442                 FromName, ".s64 \t$dst, $src;"), []>;
    443     def _u64 :
    444       NVPTXInst<(outs RC:$dst),
    445                 (ins Int64Regs:$src, CvtMode:$mode),
    446                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    447                 FromName, ".u64 \t$dst, $src;"), []>;
    448     def _f16 :
    449       NVPTXInst<(outs RC:$dst),
    450                 (ins Float16Regs:$src, CvtMode:$mode),
    451                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    452                 FromName, ".f16 \t$dst, $src;"), []>;
    453     def _f32 :
    454       NVPTXInst<(outs RC:$dst),
    455                 (ins Float32Regs:$src, CvtMode:$mode),
    456                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    457                 FromName, ".f32 \t$dst, $src;"), []>;
    458     def _f64 :
    459       NVPTXInst<(outs RC:$dst),
    460                 (ins Float64Regs:$src, CvtMode:$mode),
    461                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
    462                 FromName, ".f64 \t$dst, $src;"), []>;
    463   }
    464 
    465   // Generate cvts from all types to all types.
          // 8-bit destinations (s8/u8) are held in Int16Regs.
    466   defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
    467   defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
    468   defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
    469   defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
    470   defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
    471   defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
    472   defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
    473   defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
    474   defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
    475   defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
    476   defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
    477 
    478   // These cvts are different from those above: The source and dest registers
    479   // are of the same type.
          // They take no CvtMode operand and have no selection pattern; the narrower
          // signed source type named in the asm string lives in a register of the
          // wider class.
    480   def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
    481                                     "cvt.s16.s8 \t$dst, $src;", []>;
    482   def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
    483                                     "cvt.s32.s8 \t$dst, $src;", []>;
    484   def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
    485                                     "cvt.s32.s16 \t$dst, $src;", []>;
    486   def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
    487                                     "cvt.s64.s8 \t$dst, $src;", []>;
    488   def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
    489                                     "cvt.s64.s16 \t$dst, $src;", []>;
    490   def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
    491                                     "cvt.s64.s32 \t$dst, $src;", []>;
    492 }
    493 
    494 //-----------------------------------
    495 // Integer Arithmetic
    496 //-----------------------------------
    497 
    498 // Template for xor masquerading as int1 arithmetic.
        // Both forms print "xor.pred" regardless of OpNode; OpNode only selects which
        // DAG node the pattern matches.  _rr takes two predicate registers, _ri a
        // predicate register and an i1 immediate.
    499 multiclass ADD_SUB_i1<SDNode OpNode> {
    500    def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
    501                       "xor.pred \t$dst, $a, $b;",
    502                       [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
    503    def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
    504                       "xor.pred \t$dst, $a, $b;",
    505                       [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
    506 }
    507 
    508 // int1 addition and subtraction are both just xor.
    509 defm ADD_i1 : ADD_SUB_i1<add>;
    510 defm SUB_i1 : ADD_SUB_i1<sub>;
    511 
    512 // int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
    513 // also use these for unsigned arithmetic.
        // SUB is instantiated from the same I3 template, so it likewise covers the
        // 16-, 32- and 64-bit rr/ri forms.
    514 defm ADD : I3<"add.s", add>;
    515 defm SUB : I3<"sub.s", sub>;
    516 
    517 // int32 addition and subtraction with carry-out.
    518 // FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
    519 defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
    520 defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
    521 
    522 // int32 addition and subtraction with carry-in and carry-out.
    523 defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
    524 defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
    525 
        // Low half of the product (mul.lo).
    526 defm MULT : I3<"mul.lo.s", mul>;
    527 
        // High half of the product, signed (mulhs) and unsigned (mulhu).
    528 defm MULTHS : I3<"mul.hi.s", mulhs>;
    529 defm MULTHU : I3<"mul.hi.u", mulhu>;
    530 
        // Signed and unsigned division.
    531 defm SDIV : I3<"div.s", sdiv>;
    532 defm UDIV : I3<"div.u", udiv>;
    533 
    534 // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
    535 // will lower it.
    536 defm SREM : I3<"rem.s", srem>;
    537 defm UREM : I3<"rem.u", urem>;
    538 
    539 // Integer absolute value.  RC is the register class of both the source and
    540 // the result, and SizeName is the type suffix appended to "abs" (e.g. ".s32").
    541 // The single anonymous def matches the `abs` node directly.
        // NOTE(review): an earlier version of this comment described a NumBits
        // parameter and a bit-twiddling idiom; neither exists in this definition.
    542 multiclass ABS<RegisterClass RC, string SizeName> {
    543   def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
    544                   !strconcat("abs", SizeName, " \t$dst, $a;"),
    545                   [(set RC:$dst, (abs RC:$a))]>;
    546 }
    547 defm ABS_16 : ABS<Int16Regs, ".s16">;
    548 defm ABS_32 : ABS<Int32Regs, ".s32">;
    549 defm ABS_64 : ABS<Int64Regs, ".s64">;
    550 
    551 // Integer min/max.
        // Signed (.s) and unsigned (.u) variants, each instantiated for 16-, 32- and
        // 64-bit operands through I3.
    552 defm SMAX : I3<"max.s", smax>;
    553 defm UMAX : I3<"max.u", umax>;
    554 defm SMIN : I3<"min.s", smin>;
    555 defm UMIN : I3<"min.u", umin>;
    556 
    557 //
    558 // Wide multiplication
    559 //
        // mul.wide.{s,u}32 take 32-bit sources and write a 64-bit result;
        // mul.wide.{s,u}16 take 16-bit sources and write a 32-bit result.  Each has
        // three forms: register-register, register-immediate with a source-width
        // immediate ("Imm"), and register-immediate with a result-width immediate
        // ("Imm64" / "Imm32").  None of these defs has a selection pattern attached
        // here.
    560 def MULWIDES64 :
    561   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
    562             "mul.wide.s32 \t$dst, $a, $b;", []>;
    563 def MULWIDES64Imm :
    564   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
    565             "mul.wide.s32 \t$dst, $a, $b;", []>;
    566 def MULWIDES64Imm64 :
    567   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
    568             "mul.wide.s32 \t$dst, $a, $b;", []>;
    569 
    570 def MULWIDEU64 :
    571   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
    572             "mul.wide.u32 \t$dst, $a, $b;", []>;
    573 def MULWIDEU64Imm :
    574   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
    575             "mul.wide.u32 \t$dst, $a, $b;", []>;
    576 def MULWIDEU64Imm64 :
    577   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
    578             "mul.wide.u32 \t$dst, $a, $b;", []>;
    579 
    580 def MULWIDES32 :
    581   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
    582             "mul.wide.s16 \t$dst, $a, $b;", []>;
    583 def MULWIDES32Imm :
    584   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
    585             "mul.wide.s16 \t$dst, $a, $b;", []>;
    586 def MULWIDES32Imm32 :
    587   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
    588             "mul.wide.s16 \t$dst, $a, $b;", []>;
    589 
    590 def MULWIDEU32 :
    591   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
    592             "mul.wide.u16 \t$dst, $a, $b;", []>;
    593 def MULWIDEU32Imm :
    594   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
    595             "mul.wide.u16 \t$dst, $a, $b;", []>;
    596 def MULWIDEU32Imm32 :
    597   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
    598             "mul.wide.u16 \t$dst, $a, $b;", []>;
    599 
    600 def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;  // 1 result, 2 operands; operands 1 and 2 must have the same type.
        // Target-specific DAG nodes for signed and unsigned widening multiply; both
        // use the SDTMulWide type profile.
    601 def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
    602 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
    603 
    604 // Matchers for signed, unsigned mul.wide ISD nodes.
        // Every pattern is gated on doMulWide.  Only the register-register and the
        // source-width-immediate ("Imm") instruction forms are selected here.
        // First group: 16-bit sources producing an i32 result.
    605 def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
    606           (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
    607       Requires<[doMulWide]>;
    608 def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
    609           (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
    610       Requires<[doMulWide]>;
    611 def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
    612           (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
    613       Requires<[doMulWide]>;
    614 def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
    615           (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
    616       Requires<[doMulWide]>;
    617 
        // Second group: 32-bit sources producing an i64 result.
    618 def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
    619           (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
    620       Requires<[doMulWide]>;
    621 def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
    622           (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
    623       Requires<[doMulWide]>;
    624 def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
    625           (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
    626       Requires<[doMulWide]>;
    627 def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
    628           (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
    629       Requires<[doMulWide]>;
    630 
    631 // Immediate-range predicates used when converting some patterns to mul.wide.
    632 def SInt32Const : PatLeaf<(imm), [{
    633   const APInt &v = N->getAPIntValue();
    634   return v.isSignedIntN(32);  // accepts immediates representable as a signed 32-bit value
    635 }]>;
    636
    637 def UInt32Const : PatLeaf<(imm), [{
    638   const APInt &v = N->getAPIntValue();
    639   return v.isIntN(32);  // accepts immediates representable as an unsigned 32-bit value
    640 }]>;
    641
    642 def SInt16Const : PatLeaf<(imm), [{
    643   const APInt &v = N->getAPIntValue();
    644   return v.isSignedIntN(16);  // accepts immediates representable as a signed 16-bit value
    645 }]>;
    646
    647 def UInt16Const : PatLeaf<(imm), [{
    648   const APInt &v = N->getAPIntValue();
    649   return v.isIntN(16);  // accepts immediates representable as an unsigned 16-bit value
    650 }]>;
    651
    652 def Int5Const : PatLeaf<(imm), [{
    653   // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
    654   const APInt &v = N->getAPIntValue();
    655   return v.sge(0) && v.slt(32);  // signed comparisons, so negative amounts are rejected
    656 }]>;
    657
    658 def Int4Const : PatLeaf<(imm), [{
    659   // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
    660   const APInt &v = N->getAPIntValue();
    661   return v.sge(0) && v.slt(16);  // signed comparisons, so negative amounts are rejected
    662 }]>;
    663 
    664 def SHL2MUL32 : SDNodeXForm<imm, [{  // rewrites a shift amount v into the i32 target constant (1 << v)
    665   const APInt &v = N->getAPIntValue();
    666   APInt temp(32, 1);  // the value 1 held in a 32-bit APInt
    667   return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
    668 }]>;
    669
    670 def SHL2MUL16 : SDNodeXForm<imm, [{  // rewrites a shift amount v into the i16 target constant (1 << v)
    671   const APInt &v = N->getAPIntValue();
    672   APInt temp(16, 1);  // the value 1 held in a 16-bit APInt
    673   return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
    674 }]>;
    675 
    676 // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
    677 def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),  // (sext a) << b  ->  mul.wide signed by (1 << b)
    678           (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
    679       Requires<[doMulWide]>;
    680 def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),  // (zext a) << b  ->  mul.wide unsigned by (1 << b)
    681           (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
    682       Requires<[doMulWide]>;
    683 // NOTE(review): the largest accepted shift (31, or 15 below) yields a multiplier with only the sign bit set; confirm the signed forms handle it.
    684 def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),  // 16-bit source, signed
    685           (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
    686       Requires<[doMulWide]>;
    687 def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),  // 16-bit source, unsigned
    688           (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
    689       Requires<[doMulWide]>;
    690 
    691 // Convert "sign/zero-extend then multiply" to mul.wide (all gated on doMulWide).
    692 def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),  // both operands sign-extended from i32
    693           (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
    694       Requires<[doMulWide]>;
    695 def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),  // i64 constant that passes SInt32Const
    696           (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
    697       Requires<[doMulWide]>;
    698
    699 def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),  // both operands zero-extended from i32
    700           (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
    701       Requires<[doMulWide]>;
    702 def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),  // i64 constant that passes UInt32Const
    703           (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
    704       Requires<[doMulWide]>;
    705
    706 def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),  // both operands sign-extended from i16
    707           (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
    708       Requires<[doMulWide]>;
    709 def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),  // i32 constant that passes SInt16Const
    710           (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
    711       Requires<[doMulWide]>;
    712
    713 def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),  // both operands zero-extended from i16
    714           (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
    715       Requires<[doMulWide]>;
    716 def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),  // i32 constant that passes UInt16Const
    717           (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
    718       Requires<[doMulWide]>;
    719 
    720 //
    721 // Integer multiply-add: the imad node selected to mad.lo.s16/s32/s64.
    722 //
    723 def SDTIMAD :  // 1 result, 3 operands; result and all three operands share one integer type
    724   SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
    725                        SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
    726 def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
    727 // Suffix letters below give the kind of $a, $b, $c in order: r = register, i = immediate.
    728 def MAD16rrr :
    729   NVPTXInst<(outs Int16Regs:$dst),
    730             (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
    731             "mad.lo.s16 \t$dst, $a, $b, $c;",
    732             [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
    733 def MAD16rri :
    734   NVPTXInst<(outs Int16Regs:$dst),
    735             (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
    736             "mad.lo.s16 \t$dst, $a, $b, $c;",
    737             [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
    738 def MAD16rir :
    739   NVPTXInst<(outs Int16Regs:$dst),
    740             (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
    741             "mad.lo.s16 \t$dst, $a, $b, $c;",
    742             [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
    743 def MAD16rii :
    744   NVPTXInst<(outs Int16Regs:$dst),
    745             (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
    746             "mad.lo.s16 \t$dst, $a, $b, $c;",
    747             [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
    748 // 32-bit forms.
    749 def MAD32rrr :
    750   NVPTXInst<(outs Int32Regs:$dst),
    751             (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
    752             "mad.lo.s32 \t$dst, $a, $b, $c;",
    753             [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
    754 def MAD32rri :
    755   NVPTXInst<(outs Int32Regs:$dst),
    756             (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
    757             "mad.lo.s32 \t$dst, $a, $b, $c;",
    758             [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
    759 def MAD32rir :
    760   NVPTXInst<(outs Int32Regs:$dst),
    761             (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
    762             "mad.lo.s32 \t$dst, $a, $b, $c;",
    763             [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
    764 def MAD32rii :
    765   NVPTXInst<(outs Int32Regs:$dst),
    766             (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
    767             "mad.lo.s32 \t$dst, $a, $b, $c;",
    768             [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
    769 // 64-bit forms.
    770 def MAD64rrr :
    771   NVPTXInst<(outs Int64Regs:$dst),
    772             (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
    773             "mad.lo.s64 \t$dst, $a, $b, $c;",
    774             [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
    775 def MAD64rri :
    776   NVPTXInst<(outs Int64Regs:$dst),
    777             (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
    778             "mad.lo.s64 \t$dst, $a, $b, $c;",
    779             [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
    780 def MAD64rir :
    781   NVPTXInst<(outs Int64Regs:$dst),
    782             (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
    783             "mad.lo.s64 \t$dst, $a, $b, $c;",
    784             [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
    785 def MAD64rii :
    786   NVPTXInst<(outs Int64Regs:$dst),
    787             (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
    788             "mad.lo.s64 \t$dst, $a, $b, $c;",
    789             [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
    790 
    791 def INEG16 :  // integer negate (ineg), 16-bit register
    792   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
    793             "neg.s16 \t$dst, $src;",
    794             [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
    795 def INEG32 :  // integer negate (ineg), 32-bit register
    796   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
    797             "neg.s32 \t$dst, $src;",
    798             [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
    799 def INEG64 :  // integer negate (ineg), 64-bit register
    800   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
    801             "neg.s64 \t$dst, $src;",
    802             [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
    803 
    804 //-----------------------------------
    805 // Floating Point Arithmetic
    806 //-----------------------------------
    807 
    808 // Constant 1.0f: matches an fpimm with IEEE single semantics whose value is 1.0.
    809 def FloatConst1 : PatLeaf<(fpimm), [{
    810   return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&  // semantics checked before the conversion
    811          N->getValueAPF().convertToFloat() == 1.0f;
    812 }]>;
    813 // Constant 1.0 (double): matches an fpimm with IEEE double semantics whose value is 1.0.
    814 def DoubleConst1 : PatLeaf<(fpimm), [{
    815   return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&  // semantics checked before the conversion
    816          N->getValueAPF().convertToDouble() == 1.0;
    817 }]>;
    818 
    819 // Loads an FP16 constant into a register.
    820 //
    821 // ptxas does not have a hex representation for fp16, so we can't use
    822 // fp16 immediate values in .f16 instructions. Instead we have to load
    823 // the constant into a register using mov.b16.
    824 def LOAD_CONST_F16 :
    825   NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
    826             "mov.b16 \t$dst, $a;", []>;  // empty pattern list: no selection pattern attached here
    827 // Instantiations of the F3_fma_component / F3 / F2 multiclasses (their definitions are outside this excerpt).
    828 defm FADD : F3_fma_component<"add", fadd>;
    829 defm FSUB : F3_fma_component<"sub", fsub>;
    830 defm FMUL : F3_fma_component<"mul", fmul>;
    831
    832 defm FMIN : F3<"min", fminnum>;
    833 defm FMAX : F3<"max", fmaxnum>;
    834
    835 defm FABS  : F2<"abs", fabs>;
    836 defm FNEG  : F2<"neg", fneg>;
    837 defm FSQRT : F2<"sqrt.rn", fsqrt>;
    838 
    839 //
    840 // F64 division (round-to-nearest forms only)
    841 //
    842 def FDIV641r :  // 1.0 / b: $a must match DoubleConst1 and is not printed; emitted as a reciprocal
    843   NVPTXInst<(outs Float64Regs:$dst),
    844             (ins f64imm:$a, Float64Regs:$b),
    845             "rcp.rn.f64 \t$dst, $b;",
    846             [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
    847 def FDIV64rr :  // register / register
    848   NVPTXInst<(outs Float64Regs:$dst),
    849             (ins Float64Regs:$a, Float64Regs:$b),
    850             "div.rn.f64 \t$dst, $a, $b;",
    851             [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
    852 def FDIV64ri :  // register / immediate
    853   NVPTXInst<(outs Float64Regs:$dst),
    854             (ins Float64Regs:$a, f64imm:$b),
    855             "div.rn.f64 \t$dst, $a, $b;",
    856             [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
    857 
    858 //
    859 // F32 Approximate reciprocal (requires do_DIVF32_APPROX)
    860 //
    861 def FDIV321r_ftz :  // 1.0f / b with .ftz; $a must match FloatConst1 and is not printed
    862   NVPTXInst<(outs Float32Regs:$dst),
    863             (ins f32imm:$a, Float32Regs:$b),
    864             "rcp.approx.ftz.f32 \t$dst, $b;",
    865             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    866             Requires<[do_DIVF32_APPROX, doF32FTZ]>;
    867 def FDIV321r :  // 1.0f / b without .ftz
    868   NVPTXInst<(outs Float32Regs:$dst),
    869             (ins f32imm:$a, Float32Regs:$b),
    870             "rcp.approx.f32 \t$dst, $b;",
    871             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    872             Requires<[do_DIVF32_APPROX]>;
    873 //
    874 // F32 Approximate division (requires do_DIVF32_APPROX)
    875 //
    876 def FDIV32approxrr_ftz :  // register / register, .ftz
    877   NVPTXInst<(outs Float32Regs:$dst),
    878             (ins Float32Regs:$a, Float32Regs:$b),
    879             "div.approx.ftz.f32 \t$dst, $a, $b;",
    880             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    881             Requires<[do_DIVF32_APPROX, doF32FTZ]>;
    882 def FDIV32approxri_ftz :  // register / immediate, .ftz
    883   NVPTXInst<(outs Float32Regs:$dst),
    884             (ins Float32Regs:$a, f32imm:$b),
    885             "div.approx.ftz.f32 \t$dst, $a, $b;",
    886             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
    887             Requires<[do_DIVF32_APPROX, doF32FTZ]>;
    888 def FDIV32approxrr :  // register / register
    889   NVPTXInst<(outs Float32Regs:$dst),
    890             (ins Float32Regs:$a, Float32Regs:$b),
    891             "div.approx.f32 \t$dst, $a, $b;",
    892             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    893             Requires<[do_DIVF32_APPROX]>;
    894 def FDIV32approxri :  // register / immediate
    895   NVPTXInst<(outs Float32Regs:$dst),
    896             (ins Float32Regs:$a, f32imm:$b),
    897             "div.approx.f32 \t$dst, $a, $b;",
    898             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
    899             Requires<[do_DIVF32_APPROX]>;
    900 //
    901 // F32 Semi-accurate reciprocal (requires do_DIVF32_FULL)
    902 //
    903 // rcp.approx gives the same result as div.full(1.0f, a) and is faster.
    904 //
    905 def FDIV321r_approx_ftz :  // same assembly text as FDIV321r_ftz, selected under do_DIVF32_FULL instead
    906   NVPTXInst<(outs Float32Regs:$dst),
    907             (ins f32imm:$a, Float32Regs:$b),
    908             "rcp.approx.ftz.f32 \t$dst, $b;",
    909             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    910             Requires<[do_DIVF32_FULL, doF32FTZ]>;
    911 def FDIV321r_approx :  // same assembly text as FDIV321r, selected under do_DIVF32_FULL instead
    912   NVPTXInst<(outs Float32Regs:$dst),
    913             (ins f32imm:$a, Float32Regs:$b),
    914             "rcp.approx.f32 \t$dst, $b;",
    915             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    916             Requires<[do_DIVF32_FULL]>;
    917 //
    918 // F32 Semi-accurate division (requires do_DIVF32_FULL)
    919 //
    920 def FDIV32rr_ftz :  // register / register, .ftz
    921   NVPTXInst<(outs Float32Regs:$dst),
    922             (ins Float32Regs:$a, Float32Regs:$b),
    923             "div.full.ftz.f32 \t$dst, $a, $b;",
    924             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    925             Requires<[do_DIVF32_FULL, doF32FTZ]>;
    926 def FDIV32ri_ftz :  // register / immediate, .ftz
    927   NVPTXInst<(outs Float32Regs:$dst),
    928             (ins Float32Regs:$a, f32imm:$b),
    929             "div.full.ftz.f32 \t$dst, $a, $b;",
    930             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
    931             Requires<[do_DIVF32_FULL, doF32FTZ]>;
    932 def FDIV32rr :  // register / register
    933   NVPTXInst<(outs Float32Regs:$dst),
    934             (ins Float32Regs:$a, Float32Regs:$b),
    935             "div.full.f32 \t$dst, $a, $b;",
    936             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    937             Requires<[do_DIVF32_FULL]>;
    938 def FDIV32ri :  // register / immediate
    939   NVPTXInst<(outs Float32Regs:$dst),
    940             (ins Float32Regs:$a, f32imm:$b),
    941             "div.full.f32 \t$dst, $a, $b;",
    942             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
    943             Requires<[do_DIVF32_FULL]>;
    944 //
    945 // F32 Accurate reciprocal (no do_DIVF32_* predicate on these definitions)
    946 //
    947 def FDIV321r_prec_ftz :  // 1.0f / b, round-to-nearest, .ftz
    948   NVPTXInst<(outs Float32Regs:$dst),
    949             (ins f32imm:$a, Float32Regs:$b),
    950             "rcp.rn.ftz.f32 \t$dst, $b;",
    951             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    952             Requires<[doF32FTZ]>;
    953 def FDIV321r_prec :  // 1.0f / b, round-to-nearest; carries no Requires clause
    954   NVPTXInst<(outs Float32Regs:$dst),
    955             (ins f32imm:$a, Float32Regs:$b),
    956             "rcp.rn.f32 \t$dst, $b;",
    957             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
    958 //
    959 // F32 Accurate division (no do_DIVF32_* predicate on these definitions)
    960 //
    961 def FDIV32rr_prec_ftz :  // register / register, round-to-nearest, .ftz
    962   NVPTXInst<(outs Float32Regs:$dst),
    963             (ins Float32Regs:$a, Float32Regs:$b),
    964             "div.rn.ftz.f32 \t$dst, $a, $b;",
    965             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    966             Requires<[doF32FTZ]>;
    967 def FDIV32ri_prec_ftz :  // register / immediate, round-to-nearest, .ftz
    968   NVPTXInst<(outs Float32Regs:$dst),
    969             (ins Float32Regs:$a, f32imm:$b),
    970             "div.rn.ftz.f32 \t$dst, $a, $b;",
    971             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
    972             Requires<[doF32FTZ]>;
    973 def FDIV32rr_prec :  // register / register, round-to-nearest; carries no Requires clause
    974   NVPTXInst<(outs Float32Regs:$dst),
    975             (ins Float32Regs:$a, Float32Regs:$b),
    976             "div.rn.f32 \t$dst, $a, $b;",
    977             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
    978 def FDIV32ri_prec :  // register / immediate, round-to-nearest; carries no Requires clause
    979   NVPTXInst<(outs Float32Regs:$dst),
    980             (ins Float32Regs:$a, f32imm:$b),
    981             "div.rn.f32 \t$dst, $a, $b;",
    982             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
    983 
    984 //
    985 // FMA: fused multiply-add (the `fma` node) for f16, f16x2, f32 and f64.
    986 //
    987 // FMA<OpcStr, RC, ImmCls, Pred> defines rrr/rri/rir/rii; the letters give the kind of $a, $b, $c (r = RC register, i = ImmCls immediate).
    988 multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
    989    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
    990                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
    991                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
    992                        Requires<[Pred]>;
    993    def rri : NVPTXInst<(outs RC:$dst),
    994                        (ins RC:$a, RC:$b, ImmCls:$c),
    995                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
    996                        [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
    997                        Requires<[Pred]>;
    998    def rir : NVPTXInst<(outs RC:$dst),
    999                        (ins RC:$a, ImmCls:$b, RC:$c),
   1000                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
   1001                        [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
   1002                        Requires<[Pred]>;
   1003    def rii : NVPTXInst<(outs RC:$dst),
   1004                        (ins RC:$a, ImmCls:$b, ImmCls:$c),
   1005                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
   1006                        [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
   1007                        Requires<[Pred]>;
   1008 }
   1009 // FMA_F16 defines only the all-register form and additionally requires useFP16Math.
   1010 multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
   1011    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
   1012                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
   1013                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
   1014                        Requires<[useFP16Math, Pred]>;
   1015 }
   1016 // Instantiations: each _ftz variant is gated on doF32FTZ; the others pass `true` as the predicate.
   1017 defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
   1018 defm FMA16     : FMA_F16<"fma.rn.f16", Float16Regs, true>;
   1019 defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
   1020 defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
   1021 defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
   1022 defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
   1023 defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;  // no _ftz variant is instantiated for f64
   1024 
   1025 // sin/cos: f32 approximate forms, selected only when allowUnsafeFPMath holds.
   1026 def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
   1027                       "sin.approx.f32 \t$dst, $src;",
   1028                       [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
   1029                       Requires<[allowUnsafeFPMath]>;
   1030 def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
   1031                       "cos.approx.f32 \t$dst, $src;",
   1032                       [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
   1033                       Requires<[allowUnsafeFPMath]>;
   1034 
   1035 // Lower (frem x, y) into (sub x, (mul (trunc (div x, y)) y)),
   1036 // i.e. "poor man's fmod()"
   1037 // The quotient is rounded toward zero (CvtRZI) so the result keeps the sign of x, as frem requires.
   1038 // frem - f32 FTZ
   1039 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
   1040           (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
   1041             (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
   1042              Float32Regs:$y))>,
   1043           Requires<[doF32FTZ]>;
   1044 def : Pat<(frem Float32Regs:$x, fpimm:$y),
   1045           (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
   1046             (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
   1047              fpimm:$y))>,
   1048           Requires<[doF32FTZ]>;
   1049
   1050 // frem - f32
   1051 def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
   1052           (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
   1053             (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
   1054              Float32Regs:$y))>;
   1055 def : Pat<(frem Float32Regs:$x, fpimm:$y),
   1056           (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
   1057             (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
   1058              fpimm:$y))>;
   1059
   1060 // frem - f64
   1061 def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
   1062           (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
   1063             (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
   1064              Float64Regs:$y))>;
   1065 def : Pat<(frem Float64Regs:$x, fpimm:$y),
   1066           (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
   1067             (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
   1068              fpimm:$y))>;
   1069 
   1070 //-----------------------------------
   1071 // Bitwise operations
   1072 //-----------------------------------
   1073 
   1074 // Template for bitwise operations taking three operands (dst, a, b).  Creates .b16,
   1075 // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
   1076 multiclass BITWISE<string OpcStr, SDNode OpNode> {  // rr = register/register, ri = register/immediate
   1077   def b1rr :
   1078     NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
   1079               !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
   1080               [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
   1081   def b1ri :
   1082     NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
   1083               !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
   1084               [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
   1085   def b16rr :
   1086     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
   1087               !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
   1088               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
   1089   def b16ri :
   1090     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
   1091               !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
   1092               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
   1093   def b32rr :
   1094     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
   1095               !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
   1096               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
   1097   def b32ri :
   1098     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
   1099               !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
   1100               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
   1101   def b64rr :
   1102     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
   1103               !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
   1104               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
   1105   def b64ri :
   1106     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
   1107               !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
   1108               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
   1109 }
   1110 // Instantiate the template for or, and, xor.
   1111 defm OR  : BITWISE<"or", or>;
   1112 defm AND : BITWISE<"and", and>;
   1113 defm XOR : BITWISE<"xor", xor>;
   1114 
   1115 def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),  // bitwise not on a predicate (i1) register
   1116                       "not.pred \t$dst, $src;",
   1117                       [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
   1118 def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),  // bitwise not, 16-bit
   1119                       "not.b16 \t$dst, $src;",
   1120                       [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
   1121 def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),  // bitwise not, 32-bit
   1122                       "not.b32 \t$dst, $src;",
   1123                       [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
   1124 def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),  // bitwise not, 64-bit
   1125                        "not.b64 \t$dst, $src;",
   1126                        [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
   1127 
   1128 // Template for left/right shifts.  Takes three operands,
   1129 //   [dest (reg), src (reg), shift (reg or imm)].
   1130 // dest and src may be int64, int32, or int16, but shift is always int32.
   1131 // OpcStr is the mnemonic without its width; "64", "32" or "16" is appended to it.
   1132 // This template also defines a 32-bit shift (imm, imm) instruction.
   1133 multiclass SHIFT<string OpcStr, SDNode OpNode> {
   1134    def i64rr :
   1135      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
   1136                !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
   1137                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
   1138    def i64ri :
   1139      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
   1140                !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
   1141                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
   1142    def i32rr :
   1143      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
   1144                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
   1145                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
   1146    def i32ri :
   1147      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
   1148                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
   1149                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
   1150    def i32ii :  // both the shifted value and the shift amount are immediates
   1151      NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
   1152                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
   1153                [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
   1154    def i16rr :
   1155      NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
   1156                !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
   1157                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
   1158    def i16ri :
   1159      NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
   1160                !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
   1161                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
   1162 }
   1163 // shl -> shl.b<N>, sra -> shr.s<N>, srl -> shr.u<N>.
   1164 defm SHL : SHIFT<"shl.b", shl>;
   1165 defm SRA : SHIFT<"shr.s", sra>;
   1166 defm SRL : SHIFT<"shr.u", srl>;
   1167 
   1168 // Bit-reverse: selects the bitreverse node to brev.b32 / brev.b64.
   1169 def BREV32 :
   1170   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
   1171              "brev.b32 \t$dst, $a;",
   1172              [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
   1173 def BREV64 :
   1174   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
   1175              "brev.b64 \t$dst, $a;",
   1176              [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
   1177 
   1178 //
   1179 // Rotate: Use the ptx shf instruction if available (hasHWROT32).
   1180 //
   1181 // $src is passed as both data operands of shf.*.wrap to express a rotate.
   1182 // 32 bit r2 = rotl r1, n
   1183 //    =>
   1184 //        r2 = shf.l r1, r1, n
   1185 def ROTL32imm_hw :  // rotate left by an immediate amount
   1186   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
   1187             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
   1188             [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
   1189            Requires<[hasHWROT32]>;
   1190
   1191 def ROTL32reg_hw :  // rotate left by a register amount
   1192   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
   1193             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
   1194             [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
   1195            Requires<[hasHWROT32]>;
   1196
   1197 // 32 bit r2 = rotr r1, n
   1198 //    =>
   1199 //        r2 = shf.r r1, r1, n
   1200 def ROTR32imm_hw :  // rotate right by an immediate amount
   1201   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
   1202             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
   1203             [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
   1204            Requires<[hasHWROT32]>;
   1205
   1206 def ROTR32reg_hw :  // rotate right by a register amount
   1207   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
   1208             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
   1209             [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
   1210            Requires<[hasHWROT32]>;
   1211 
   1212 // 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
   1213 def ROT32imm_sw :  // emits shl by $amt1, shr by $amt2, then adds the two halves
   1214   NVPTXInst<(outs Int32Regs:$dst),
   1215             (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
   1216             "{{\n\t"
   1217             ".reg .b32 %lhs;\n\t"
   1218             ".reg .b32 %rhs;\n\t"
   1219             "shl.b32 \t%lhs, $src, $amt1;\n\t"
   1220             "shr.b32 \t%rhs, $src, $amt2;\n\t"
   1221             "add.u32 \t$dst, %lhs, %rhs;\n\t"
   1222             "}}",
   1223             []>;
   1224 // SUB_FRM_32 maps an immediate n to the i32 target constant (32 - n).
   1225 def SUB_FRM_32 : SDNodeXForm<imm, [{
   1226   return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
   1227 }]>;
   1228 // rotl passes ($amt, 32 - $amt); rotr passes the same pair swapped.  Used only under noHWROT32.
   1229 def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
   1230           (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
   1231       Requires<[noHWROT32]>;
   1232 def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
   1233           (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
   1234       Requires<[noHWROT32]>;
   1235 
   1236 // 32-bit software rotate left by register; $amt is reduced modulo 32 first.
   1237 def ROTL32reg_sw :
   1238   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
   1239             "{{\n\t"
   1240             ".reg .b32 %lhs;\n\t"
   1241             ".reg .b32 %rhs;\n\t"
   1242             ".reg .b32 %amt2;\n\t" "and.b32 \t%amt2, $amt, 31;\n\t"  // mask: PTX shifts clamp (not wrap) amounts above 32
   1243             "shl.b32 \t%lhs, $src, %amt2;\n\t"
   1244             "sub.s32 \t%amt2, 32, %amt2;\n\t"  // complementary amount; 32 when the masked amount is 0, so %rhs becomes 0
   1245             "shr.b32 \t%rhs, $src, %amt2;\n\t"
   1246             "add.u32 \t$dst, %lhs, %rhs;\n\t"
   1247             "}}",
   1248             [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
   1249            Requires<[noHWROT32]>;
   1250
   1251 // 32-bit software rotate right by register; $amt is reduced modulo 32 first.
   1252 def ROTR32reg_sw :
   1253   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
   1254             "{{\n\t"
   1255             ".reg .b32 %lhs;\n\t"
   1256             ".reg .b32 %rhs;\n\t"
   1257             ".reg .b32 %amt2;\n\t" "and.b32 \t%amt2, $amt, 31;\n\t"  // mask: PTX shifts clamp (not wrap) amounts above 32
   1258             "shr.b32 \t%lhs, $src, %amt2;\n\t"
   1259             "sub.s32 \t%amt2, 32, %amt2;\n\t"  // complementary amount; 32 when the masked amount is 0, so %rhs becomes 0
   1260             "shl.b32 \t%rhs, $src, %amt2;\n\t"
   1261             "add.u32 \t$dst, %lhs, %rhs;\n\t"
   1262             "}}",
   1263             [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
   1264            Requires<[noHWROT32]>;
   1265 
   1266 // 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
   1267 def ROT64imm_sw :  // emits shl by $amt1, shr by $amt2, then adds the two halves
   1268   NVPTXInst<(outs Int64Regs:$dst),
   1269             (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
   1270             "{{\n\t"
   1271             ".reg .b64 %lhs;\n\t"
   1272             ".reg .b64 %rhs;\n\t"
   1273             "shl.b64 \t%lhs, $src, $amt1;\n\t"
   1274             "shr.b64 \t%rhs, $src, $amt2;\n\t"
   1275             "add.u64 \t$dst, %lhs, %rhs;\n\t"
   1276             "}}",
   1277             []>;
   1278 // SUB_FRM_64 maps an immediate n to the i32 target constant (64 - n).
   1279 def SUB_FRM_64 : SDNodeXForm<imm, [{
   1280     return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
   1281 }]>;
   1282 // rotl passes ($amt, 64 - $amt); rotr passes the same pair swapped.  These patterns carry no Requires clause.
   1283 def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
   1284           (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
   1285 def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
   1286           (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
   1287 
   1288 // 64-bit software rotate left by register; $amt is reduced modulo 64 first.
   1289 def ROTL64reg_sw :
   1290   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
   1291             "{{\n\t"
   1292             ".reg .b64 %lhs;\n\t"
   1293             ".reg .b64 %rhs;\n\t"
   1294             ".reg .u32 %amt2;\n\t" "and.b32 \t%amt2, $amt, 63;\n\t"  // mask: PTX shifts clamp (not wrap) amounts above 64
   1295             "shl.b64 \t%lhs, $src, %amt2;\n\t"
   1296             "sub.u32 \t%amt2, 64, %amt2;\n\t"  // complementary amount; 64 when the masked amount is 0, so %rhs becomes 0
   1297             "shr.b64 \t%rhs, $src, %amt2;\n\t"
   1298             "add.u64 \t$dst, %lhs, %rhs;\n\t"
   1299             "}}",
   1300             [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
   1301 // 64-bit software rotate right by register; $amt is reduced modulo 64 first.
   1302 def ROTR64reg_sw :
   1303   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
   1304             "{{\n\t"
   1305             ".reg .b64 %lhs;\n\t"
   1306             ".reg .b64 %rhs;\n\t"
   1307             ".reg .u32 %amt2;\n\t" "and.b32 \t%amt2, $amt, 63;\n\t"  // mask: PTX shifts clamp (not wrap) amounts above 64
   1308             "shr.b64 \t%lhs, $src, %amt2;\n\t"
   1309             "sub.u32 \t%amt2, 64, %amt2;\n\t"  // complementary amount; 64 when the masked amount is 0, so %rhs becomes 0
   1310             "shl.b64 \t%rhs, $src, %amt2;\n\t"
   1311             "add.u64 \t$dst, %lhs, %rhs;\n\t"
   1312             "}}",
   1313             [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
   1314 
   1315 //
   1316 // Funnel shift in clamp mode
   1317 //
   1318 
   1319 // Create SDNodes so they can be used in the DAG code, e.g.
   1320 // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
        //
        // Type profile: one result and three operands.  The result and the first two
        // operands share one integer type; the third operand (the shift amount) is
        // an integer whose type is not tied to the others.
   1321 def SDTIntShiftDOp :
   1322   SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
   1323                        SDTCisInt<0>, SDTCisInt<3>]>;
   1324 def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
   1325 def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
   1326 
        // Selects FUN_SHFL_CLAMP to `shf.l.clamp.b32`; operands are passed through
        // in ($lo, $hi, $amt) order.
   1327 def FUNSHFLCLAMP :
   1328   NVPTXInst<(outs Int32Regs:$dst),
   1329             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
   1330             "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
   1331             [(set Int32Regs:$dst,
   1332               (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
   1333 
        // Selects FUN_SHFR_CLAMP to `shf.r.clamp.b32`; same operand order as the
        // left-shift form.
   1334 def FUNSHFRCLAMP :
   1335   NVPTXInst<(outs Int32Regs:$dst),
   1336             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
   1337             "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
   1338             [(set Int32Regs:$dst,
   1339              (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
   1340 
   1341 //
   1342 // BFE - bit-field extract
   1343 //
   1344 
   1345 // Template for BFE instructions.  Takes four args,
   1346 //   [dest (reg), src (reg), start (reg or imm), len (reg or imm)].
   1347 // Start may be an imm only if len is also an imm.  FIXME: Is this a
   1348 // restriction in PTX?
   1349 //
   1350 // dest and src may be int32 or int64, but start and len are always int32.
        //
        // Per the PTX ISA, `bfe d, a, b, c` takes the starting bit position in b and
        // the field length in bits in c; c is a length, not an end position.
        //
        // Variants defined below (none carries a selection pattern):
        //   rrr - start and len in registers
        //   rri - start in a register, len immediate
        //   rii - start and len immediate
   1351 multiclass BFE<string TyStr, RegisterClass RC> {
   1352   def rrr
   1353     : NVPTXInst<(outs RC:$d),
   1354                 (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
   1355                 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
   1356   def rri
   1357     : NVPTXInst<(outs RC:$d),
   1358                 (ins RC:$a, Int32Regs:$b, i32imm:$c),
   1359                 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
   1360   def rii
   1361     : NVPTXInst<(outs RC:$d),
   1362                 (ins RC:$a, i32imm:$b, i32imm:$c),
   1363                 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
   1364 }
   1365 
   1366 let hasSideEffects = 0 in {
        // Signed and unsigned bit-field extract for 32- and 64-bit sources.  Each
        // defm expands to the rrr/rri/rii variants of the BFE multiclass, e.g.
        // BFE_S32rrr, BFE_S32rri, BFE_S32rii.
   1367   defm BFE_S32 : BFE<"s32", Int32Regs>;
   1368   defm BFE_U32 : BFE<"u32", Int32Regs>;
   1369   defm BFE_S64 : BFE<"s64", Int64Regs>;
   1370   defm BFE_U64 : BFE<"u64", Int64Regs>;
   1371 }
   1372 
   1373 //-----------------------------------
   1374 // Comparison instructions (setp, set)
   1375 //-----------------------------------
   1376 
   1377 // FIXME: This doesn't cover versions of set and setp that combine with a
   1378 // boolean predicate, e.g. setp.eq.and.b16.
   1379 
   1380 let hasSideEffects = 0 in {
        // Template for `setp` compares that write a predicate (Int1Regs) result.
        //   TypeStr - PTX type suffix printed after the comparison mode
        //   RC      - register class of the compared values
        //   ImmCls  - immediate operand class used by the ri/ir forms
        // The CmpMode operand $cmp is printed twice, once with the `base` modifier
        // and once with the `ftz` modifier, to build the mnemonic suffix.  None of
        // the variants carries a selection pattern.
   1381   multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
        // register, register
   1382     def rr :
   1383       NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
   1384                 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
   1385                            " \t$dst, $a, $b;"), []>;
        // register, immediate
   1386     def ri :
   1387       NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
   1388                 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
   1389                            " \t$dst, $a, $b;"), []>;
        // immediate, register
   1390     def ir :
   1391       NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
   1392                 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
   1393                            " \t$dst, $a, $b;"), []>;
   1394   }
   1395 }
   1396 
        // Instantiations; each defm yields <name>rr, <name>ri and <name>ir.
   1397 defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
   1398 defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
   1399 defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
   1400 defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
   1401 defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
   1402 defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
   1403 defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
   1404 defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
   1405 defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
   1406 defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
   1407 defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
        // f16 compare: only a register/register form is defined, and it is gated on
        // useFP16Math.  Note it is declared outside any `hasSideEffects = 0` block.
   1408 def SETP_f16rr :
   1409       NVPTXInst<(outs Int1Regs:$dst),
   1410                 (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
   1411                 "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
   1412                 []>, Requires<[useFP16Math]>;
   1413 
        // Packed f16x2 compare: produces two predicate results, printed as `$p|$q`.
   1414 def SETP_f16x2rr :
   1415       NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
   1416                 (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
   1417                 "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
   1418                 []>,
   1419                 Requires<[useFP16Math]>;
   1420 
   1421 
   1422 // FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
   1423 // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
   1424 // reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
        // NOTE(review): the "aren't used" remark looks stale -- the ISET_FORMAT and
        // FSET_FORMAT patterns in this file select these instructions for compares
        // that produce an i32 result.
   1425 
   1426 let hasSideEffects = 0 in {
        // Template for `set` compares that write an Int32Regs result.  Unlike SETP,
        // the CmpMode operand is printed once, without a modifier, and only a single
        // type suffix (TypeStr) follows it.  None of the variants carries a
        // selection pattern.
   1427   multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
   1428     def rr : NVPTXInst<(outs Int32Regs:$dst),
   1429                        (ins RC:$a, RC:$b, CmpMode:$cmp),
   1430                        !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
   1431     def ri : NVPTXInst<(outs Int32Regs:$dst),
   1432                        (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
   1433                        !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
   1434     def ir : NVPTXInst<(outs Int32Regs:$dst),
   1435                        (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
   1436                        !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
   1437   }
   1438 }
   1439 
        // Instantiations; each defm yields <name>rr, <name>ri and <name>ir.
   1440 defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
   1441 defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
   1442 defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
   1443 defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
   1444 defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
   1445 defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
   1446 defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
   1447 defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
   1448 defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
   1449 defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
   1450 defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
   1451 defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
   1452 
   1453 //-----------------------------------
   1454 // Selection instructions (selp)
   1455 //-----------------------------------
   1456 
   1457 // FIXME: Missing slct
   1458 
   1459 // selp instructions that don't have any pattern matches; we explicitly use
   1460 // them within this file.
        // (This remark applies to the SELP multiclass; SELP_PATTERN, defined in the
        // same block, does attach `select` patterns.)
   1461 let hasSideEffects = 0 in {
        // Pattern-less selp: rr/ri/ir/ii choose register or immediate for $a and $b.
        // The predicate $p is always a register and is printed last.
   1462   multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
   1463     def rr : NVPTXInst<(outs RC:$dst),
   1464                        (ins RC:$a, RC:$b, Int1Regs:$p),
   1465                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
   1466     def ri : NVPTXInst<(outs RC:$dst),
   1467                        (ins RC:$a, ImmCls:$b, Int1Regs:$p),
   1468                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
   1469     def ir : NVPTXInst<(outs RC:$dst),
   1470                        (ins ImmCls:$a, RC:$b, Int1Regs:$p),
   1471                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
   1472     def ii : NVPTXInst<(outs RC:$dst),
   1473                        (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
   1474                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
   1475   }
   1476 
        // Same four operand forms, but each matches `select $p, $a, $b`, i.e. $a is
        // the value chosen when $p is true.  ImmNode is the DAG leaf used to match
        // immediate operands (the instantiations in this file pass `imm` for
        // integer types and `fpimm` for floating-point types).
   1477   multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
   1478                           SDNode ImmNode> {
   1479     def rr :
   1480       NVPTXInst<(outs RC:$dst),
   1481                 (ins RC:$a, RC:$b, Int1Regs:$p),
   1482                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
   1483                 [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
   1484     def ri :
   1485       NVPTXInst<(outs RC:$dst),
   1486                 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
   1487                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
   1488                 [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
   1489     def ir :
   1490       NVPTXInst<(outs RC:$dst),
   1491                 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
   1492                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
   1493                 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
   1494     def ii :
   1495       NVPTXInst<(outs RC:$dst),
   1496                 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
   1497                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
   1498                 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
   1499   }
   1500 }
   1501 
   1502 // Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
   1503 // good.
        // Hence only the .b* integer forms and the floating-point forms use
        // SELP_PATTERN; the .s*/.u* forms are pattern-less SELP instantiations.
   1504 defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
   1505 defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
   1506 defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
   1507 defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
   1508 defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
   1509 defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
   1510 defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
   1511 defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
   1512 defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
        // f16 selects are emitted with the untyped "b16" suffix rather than "f16".
        // NOTE(review): presumably because PTX has no selp.f16 form -- confirm.
   1513 defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
   1514 defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
   1515 defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
   1516 
        // Select between two packed f16x2 values; emitted as a 32-bit untyped selp.
        // Only the register/register form exists.
   1517 def SELP_f16x2rr :
   1518     NVPTXInst<(outs Float16x2Regs:$dst),
   1519               (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
   1520               "selp.b32 \t$dst, $a, $b, $p;",
   1521               [(set Float16x2Regs:$dst,
   1522                     (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
   1523 
   1524 //-----------------------------------
   1525 // Data Movement (Load / Store, Move)
   1526 //-----------------------------------
   1527 
        // Addressing-mode matchers.  Each ComplexPattern names the C++ selector
        // routine that performs the match and the number of operands it yields:
        //   ADDRri / ADDRri64 - two operands, rooted at frameindex nodes, for
        //                       32-bit / 64-bit addresses (SDNPWantRoot set)
        //   ADDRvar           - one pointer-typed operand via SelectDirectAddr
   1528 def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
   1529                             [SDNPWantRoot]>;
   1530 def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
   1531                               [SDNPWantRoot]>;
   1532 def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
   1533 
        // Memory operands made of two sub-operands, a register followed by an
        // immediate; printed with printMemOperand.
   1534 def MEMri : Operand<i32> {
   1535   let PrintMethod = "printMemOperand";
   1536   let MIOperandInfo = (ops Int32Regs, i32imm);
   1537 }
   1538 def MEMri64 : Operand<i64> {
   1539   let PrintMethod = "printMemOperand";
   1540   let MIOperandInfo = (ops Int64Regs, i64imm);
   1541 }
   1542 
        // Pointer-typed operands printed with the generic printOperand method:
        // imem uses iPTR, imemAny uses iPTRAny.
   1543 def imem : Operand<iPTR> {
   1544   let PrintMethod = "printOperand";
   1545 }
   1546 
   1547 def imemAny : Operand<iPTRAny> {
   1548   let PrintMethod = "printOperand";
   1549 }
   1550 
        // i32 operand printed with printLdStCode.
   1551 def LdStCode : Operand<i32> {
   1552   let PrintMethod = "printLdStCode";
   1553 }
   1554 
        // NVPTXISD::Wrapper: one pointer-typed result and one operand of the same
        // type.
   1555 def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
   1556 def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
   1557 
   1558 // Load a memory address into a u32 or u64 register.
        // Both forms match a Wrapper node around a target global address; the
        // result register class (Int32Regs vs. Int64Regs) selects between them.
   1559 def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
   1560                          "mov.u32 \t$dst, $a;",
   1561                          [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
   1562 def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
   1563                            "mov.u64 \t$dst, $a;",
   1564                            [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
   1565 
   1566 // Get pointer to local stack.
        // The immediate $num is printed directly after the `__local_depot` prefix to
        // form the symbol name moved into $d.  Neither form has a selection pattern.
   1567 let hasSideEffects = 0 in {
   1568   def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
   1569                                      "mov.u32 \t$d, __local_depot$num;", []>;
   1570   def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
   1571                                     "mov.u64 \t$d, __local_depot$num;", []>;
   1572 }
   1573 
   1574 
   1575 // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
        // Register-to-register moves, one per register class.  All are flagged
        // IsSimpleMove and carry no selection pattern.
   1576 let IsSimpleMove=1, hasSideEffects=0 in {
   1577   def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
   1578                            "mov.pred \t$dst, $sss;", []>;
   1579   def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
   1580                            "mov.u16 \t$dst, $sss;", []>;
   1581   def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
   1582                            "mov.u32 \t$dst, $sss;", []>;
   1583   def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
   1584                            "mov.u64 \t$dst, $sss;", []>;
   1585 
   1586   def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
   1587                            // We have to use .b16 here as there's no mov.f16.
   1588                            "mov.b16 \t$dst, $src;", []>;
   1589   def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
   1590                            "mov.f32 \t$dst, $src;", []>;
   1591   def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
   1592                            "mov.f64 \t$dst, $src;", []>;
   1593 }
   1594 
        // Immediate materialization: each matches a bare integer immediate of the
        // destination type.  Note the 64-bit form is named IMOV64i, not IMOV64ri.
   1595 def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
   1596                         "mov.pred \t$dst, $src;",
   1597                         [(set Int1Regs:$dst, imm:$src)]>;
   1598 def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
   1599                          "mov.u16 \t$dst, $src;",
   1600                          [(set Int16Regs:$dst, imm:$src)]>;
   1601 def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
   1602                          "mov.u32 \t$dst, $src;",
   1603                          [(set Int32Regs:$dst, imm:$src)]>;
   1604 def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
   1605                         "mov.u64 \t$dst, $src;",
   1606                         [(set Int64Regs:$dst, imm:$src)]>;
   1607 
        // Floating-point immediate materialization for f32 and f64.  No f16 form is
        // defined in this group.
   1608 def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
   1609                          "mov.f32 \t$dst, $src;",
   1610                          [(set Float32Regs:$dst, fpimm:$src)]>;
   1611 def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
   1612                          "mov.f64 \t$dst, $src;",
   1613                          [(set Float64Regs:$dst, fpimm:$src)]>;
   1614 
        // A 32-bit Wrapper around a target external symbol is materialized with
        // IMOV32ri, i.e. the symbol is passed as the "immediate" operand.
   1615 def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
   1616 
   1617 //---- Copy Frame Index ----
        // Materializes an address matched by ADDRri / ADDRri64 into a register with
        // an `add`.  The MEMri operand is printed with the `add` modifier so that it
        // supplies both source operands of the instruction.
   1618 def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
   1619                             "add.u32 \t$dst, ${addr:add};",
   1620                             [(set Int32Regs:$dst, ADDRri:$addr)]>;
   1621 def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
   1622                             "add.u64 \t$dst, ${addr:add};",
   1623                             [(set Int64Regs:$dst, ADDRri64:$addr)]>;
   1624 
   1625 //-----------------------------------
   1626 // Comparison and Selection
   1627 //-----------------------------------
   1628 
   1629 multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
   1630                        Instruction setp_16rr,
   1631                        Instruction setp_16ri,
   1632                        Instruction setp_16ir,
   1633                        Instruction setp_32rr,
   1634                        Instruction setp_32ri,
   1635                        Instruction setp_32ir,
   1636                        Instruction setp_64rr,
   1637                        Instruction setp_64ri,
   1638                        Instruction setp_64ir,
   1639                        Instruction set_16rr,
   1640                        Instruction set_16ri,
   1641                        Instruction set_16ir,
   1642                        Instruction set_32rr,
   1643                        Instruction set_32ri,
   1644                        Instruction set_32ir,
   1645                        Instruction set_64rr,
   1646                        Instruction set_64ri,
   1647                        Instruction set_64ir> {
        // Integer comparison selection patterns for one comparison operator.
        //   OpNode - comparison fragment to match (e.g. setgt)
        //   Mode   - CmpMode leaf passed as the instruction's $cmp operand
        //   setp_* - instructions used when the compare produces an i1 result
        //   set_*  - instructions used when the compare produces an i32 result
        // For each of the 16/32/64-bit operand widths there are three patterns:
        // reg/reg, reg/imm and imm/reg.
        //
   1648   // i16 -> pred
   1649   def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
   1650             (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
   1651   def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
   1652             (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
   1653   def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
   1654             (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
   1655   // i32 -> pred
   1656   def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
   1657             (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
   1658   def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
   1659             (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
   1660   def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
   1661             (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
   1662   // i64 -> pred
   1663   def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
   1664             (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
   1665   def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
   1666             (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
   1667   def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
   1668             (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
   1669 
   1670   // i16 -> i32
   1671   def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
   1672             (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
   1673   def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
   1674             (set_16ri Int16Regs:$a, imm:$b, Mode)>;
   1675   def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
   1676             (set_16ir imm:$a, Int16Regs:$b, Mode)>;
   1677   // i32 -> i32
   1678   def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
   1679             (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
   1680   def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
   1681             (set_32ri Int32Regs:$a, imm:$b, Mode)>;
   1682   def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
   1683             (set_32ir imm:$a, Int32Regs:$b, Mode)>;
   1684   // i64 -> i32
   1685   def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
   1686             (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
   1687   def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
   1688             (set_64ri Int64Regs:$a, imm:$b, Mode)>;
   1689   def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
   1690             (set_64ir imm:$a, Int64Regs:$b, Mode)>;
   1691 }
   1692 
   1693 multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
   1694   : ISET_FORMAT<OpNode, Mode,
   1695                 SETP_s16rr, SETP_s16ri, SETP_s16ir,
   1696                 SETP_s32rr, SETP_s32ri, SETP_s32ir,
   1697                 SETP_s64rr, SETP_s64ri, SETP_s64ir,
   1698                 SET_s16rr, SET_s16ri, SET_s16ir,
   1699                 SET_s32rr, SET_s32ri, SET_s32ir,
   1700                 SET_s64rr, SET_s64ri, SET_s64ir> {
        // Binds ISET_FORMAT to the signed (.s16/.s32/.s64) setp and set
        // instructions.  The PatLeaf below is only a placeholder record.
   1701   // TableGen doesn't like empty multiclasses.
   1702   def : PatLeaf<(i32 0)>;
   1703 }
   1704 
   1705 multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
   1706   : ISET_FORMAT<OpNode, Mode,
   1707                 SETP_u16rr, SETP_u16ri, SETP_u16ir,
   1708                 SETP_u32rr, SETP_u32ri, SETP_u32ir,
   1709                 SETP_u64rr, SETP_u64ri, SETP_u64ir,
   1710                 SET_u16rr, SET_u16ri, SET_u16ir,
   1711                 SET_u32rr, SET_u32ri, SET_u32ir,
   1712                 SET_u64rr, SET_u64ri, SET_u64ir> {
        // Binds ISET_FORMAT to the unsigned (.u16/.u32/.u64) setp and set
        // instructions.  The PatLeaf below is only a placeholder record.
   1713   // TableGen doesn't like empty multiclasses.
   1714   def : PatLeaf<(i32 0)>;
   1715 }
   1716 
        // Signed and sign-agnostic fragments (setgt .. setne) go to the signed
        // instructions; the setu* fragments go to the unsigned instructions.  Both
        // families reuse the same six comparison modes (CmpGT .. CmpNE), so the
        // signedness is carried by the instruction's type suffix, not by the mode.
   1717 defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
   1718 defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
   1719 defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
   1720 defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
   1721 defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
   1722 defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
   1723 defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
   1724 defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
   1725 defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
   1726 defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
   1727 defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
   1728 defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
   1729 
   1730 // i1 compares
        // Predicate (in)equality is selected to logic ops instead of setp:
        //   a != b  ->  a XOR b
        //   a == b  ->  NOT (a XOR b)
        // The setu* fragments get the same expansion as their plain counterparts.
   1731 def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
   1732           (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
   1733 def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
   1734           (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
   1735 
   1736 def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
   1737           (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
   1738 def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
   1739           (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
   1740 
   1741 // i1 compare -> i32
        // The i32 result is materialized with selp on the XOR of the two predicates:
        //   setne: XOR true  -> -1, otherwise  0
        //   seteq: XOR true  ->  0, otherwise -1
        // The second pattern previously matched `setne` again, which made it an
        // unreachable duplicate and left `seteq` on i1 with an i32 result uncovered.
   1742 def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
   1743           (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
   1744 def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
   1745           (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
   1746 
   1747 
   1748 
   1749 multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
        // Floating-point comparison selection patterns for one comparison operator.
        //   OpNode  - comparison fragment to match (e.g. setogt)
        //   Mode    - CmpMode leaf used by the default patterns
        //   ModeFTZ - CmpMode leaf used by the patterns predicated on doF32FTZ
        // f16 and f32 get an FTZ and a non-FTZ pattern for every operand form; f64
        // only has the non-FTZ form.  f16 patterns additionally require
        // useFP16Math, and f16 immediates are first loaded into a register with
        // LOAD_CONST_F16, so every f16 pattern selects the register/register (rr)
        // instruction.
        //
   1750   // f16 -> pred
   1751   def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
   1752             (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
   1753         Requires<[useFP16Math,doF32FTZ]>;
   1754   def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
   1755             (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
   1756         Requires<[useFP16Math]>;
   1757   def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
   1758             (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
   1759         Requires<[useFP16Math,doF32FTZ]>;
   1760   def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
   1761             (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
   1762         Requires<[useFP16Math]>;
   1763   def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
   1764             (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
   1765         Requires<[useFP16Math,doF32FTZ]>;
   1766   def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
   1767             (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
   1768         Requires<[useFP16Math]>;
   1769 
   1770   // f32 -> pred
   1771   def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
   1772             (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
   1773         Requires<[doF32FTZ]>;
   1774   def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
   1775             (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
   1776   def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
   1777             (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
   1778         Requires<[doF32FTZ]>;
   1779   def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
   1780             (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
   1781   def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
   1782             (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
   1783         Requires<[doF32FTZ]>;
   1784   def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
   1785             (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
   1786 
   1787   // f64 -> pred
   1788   def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
   1789             (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
   1790   def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
   1791             (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
   1792   def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
   1793             (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
   1794 
   1795   // f16 -> i32
   1796   def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
   1797             (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
   1798         Requires<[useFP16Math, doF32FTZ]>;
   1799   def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
   1800             (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
   1801         Requires<[useFP16Math]>;
   1802   def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
   1803             (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
   1804         Requires<[useFP16Math, doF32FTZ]>;
   1805   def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
   1806             (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
   1807         Requires<[useFP16Math]>;
        // The immediate has been loaded into a register by LOAD_CONST_F16, so the
        // rr form is the one whose operand classes match (the ir form expects an
        // f16imm in its first operand slot).
   1808   def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
   1809             (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
   1810         Requires<[useFP16Math, doF32FTZ]>;
   1811   def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
   1812             (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
   1813         Requires<[useFP16Math]>;
   1814 
   1815   // f32 -> i32
   1816   def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
   1817             (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
   1818         Requires<[doF32FTZ]>;
   1819   def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
   1820             (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
   1821   def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
   1822             (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
   1823         Requires<[doF32FTZ]>;
   1824   def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
   1825             (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
   1826   def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
   1827             (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
   1828         Requires<[doF32FTZ]>;
   1829   def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
   1830             (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
   1831 
   1832   // f64 -> i32
   1833   def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
   1834             (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
   1835   def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
   1836             (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
   1837   def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
   1838             (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
   1839 }
   1840 
   1841 defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
   1842 defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
   1843 defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
   1844 defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
   1845 defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
   1846 defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
        // ^ Ordered fragments (seto*) use the plain Cmp* modes.
   1847 
        // Unordered fragments (setu*) use the Cmp*U modes.
   1848 defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
   1849 defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
   1850 defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
   1851 defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
   1852 defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
   1853 defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
   1854 
        // Fragments without an ordered/unordered qualifier are given the same modes
        // as the ordered fragments.
   1855 defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
   1856 defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
   1857 defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
   1858 defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
   1859 defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
   1860 defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
   1861 
        // seto / setuo map to the CmpNUM / CmpNAN modes.
   1862 defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
   1863 defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
   1864 
   1865 // FIXME: What is this doing here?  Can it be deleted?
   1866 // def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
   1867 //                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
   1868 
   1869 def SDTDeclareParamProfile :
   1870   SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
   1871 def SDTDeclareScalarParamProfile :
   1872   SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
   1873 def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
   1874 def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
   1875 def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
   1876 def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
   1877 def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
   1878 def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
   1879 def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
   1880 def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
   1881 def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
   1882 def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
   1883 def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
   1884 def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
   1885 def SDTCallValProfile : SDTypeProfile<1, 0, []>;
   1886 def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
   1887 def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
   1888 def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
   1889 def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
   1890 def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
   1891 
   1892 def DeclareParam :
   1893   SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
   1894          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1895 def DeclareScalarParam :
   1896   SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
   1897          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1898 def DeclareRetParam :
   1899   SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
   1900          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1901 def DeclareRet :
   1902   SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
   1903          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   // LoadParam nodes (scalar, V2, V4). Unlike the other nodes in this group
   // they are tagged SDNPMayLoad instead of SDNPSideEffect; they still carry a
   // chain and both glue properties. Their profiles are defined outside this
   // part of the file.
   1904 def LoadParam :
   1905   SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
   1906          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
   1907 def LoadParamV2 :
   1908   SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
   1909          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
   1910 def LoadParamV4 :
   1911   SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
   1912          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
   // PrintCall nodes: plain and "Uni" forms, each with a Convergent twin. The
   // convergent nodes use the same profile and the same property list as their
   // non-convergent counterparts; only the NVPTXISD opcode name differs.
   1913 def PrintCall :
   1914   SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
   1915          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1916 def PrintConvergentCall :
   1917   SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
   1918          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1919 def PrintCallUni :
   1920   SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
   1921          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1922 def PrintConvergentCallUni :
   1923   SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
   1924          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   // StoreParam nodes: scalar, V2, V4, plus U32/S32 forms that share the
   // SDTStoreParam32Profile. All are chained, glued on both sides and marked as
   // having side effects.
   1925 def StoreParam :
   1926   SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
   1927          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1928 def StoreParamV2 :
   1929   SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
   1930          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1931 def StoreParamV4 :
   1932   SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
   1933          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1934 def StoreParamU32 :
   1935   SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
   1936          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1937 def StoreParamS32 :
   1938   SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
   1939          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   // Nodes for the pieces of a call: argument-list begin/end markers, the
   // individual arguments, the callee, the prototype reference and the
   // value-returning form. All share the same chain/glue/side-effect property
   // list. CallArgEnd, CallVoid and Prototype all use the one-operand
   // SDTCallVoidProfile.
   1940 def CallArgBegin :
   1941   SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
   1942          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1943 def CallArg :
   1944   SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
   1945          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1946 def LastCallArg :
   1947   SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
   1948          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1949 def CallArgEnd :
   1950   SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
   1951          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1952 def CallVoid :
   1953   SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
   1954          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1955 def Prototype :
   1956   SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
   1957          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   1958 def CallVal :
   1959   SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
   1960          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   // Remaining nodes of the group. Property lists differ here:
   //  - MoveParam has an empty property list (no chain, no glue).
   //  - StoreRetval, StoreRetvalV2, StoreRetvalV4 and RETURN are chained and
   //    have side effects but carry no glue.
   //  - PseudoUseParam uses the full chain/glue/side-effect list.
   1961 def MoveParam :
   1962   SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
   1963 def StoreRetval :
   1964   SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
   1965          [SDNPHasChain, SDNPSideEffect]>;
   1966 def StoreRetvalV2 :
   1967   SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
   1968          [SDNPHasChain, SDNPSideEffect]>;
   1969 def StoreRetvalV4 :
   1970   SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
   1971          [SDNPHasChain, SDNPSideEffect]>;
   1972 def PseudoUseParam :
   1973   SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
   1974          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
   // RETURN takes no operands: it reuses the zero-operand SDTCallArgMarkProfile.
   1975 def RETURNNode :
   1976   SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
   1977          [SDNPHasChain, SDNPSideEffect]>;
   1978 
   // Instruction classes that print "ld.param<opstr>" reading from
   // [retval0+$b], where $b is an i32 immediate. The scalar, V2 and V4 classes
   // differ only in how many destination registers of `regclass` they define.
   // All three are marked mayLoad and have an empty pattern list, so nothing in
   // this file selects them by pattern (presumably they are selected from C++
   // code -- TODO confirm).
   // NOTE(review): the doubled braces in the V2/V4 asm strings look like the
   // escape for printing a literal '{' / '}' -- confirm.
   1979 let mayLoad = 1 in {
   1980   class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
   1981         NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
   1982                   !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
   1983                   []>;
   1984 
   1985   class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
   1986         NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
   1987                   !strconcat("ld.param.v2", opstr,
   1988                              " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
   1989 
   1990   class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
   1991         NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
   1992                         regclass:$dst4),
   1993                   (ins i32imm:$b),
   1994                   !strconcat("ld.param.v4", opstr,
   1995                              " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
   1996                   []>;
   1997 }
   1998 
   // Register form of LoadParam: prints "mov<opstr> $dst, retval$b;" (the
   // immediate $b is appended directly to the name "retval"). Unlike the
   // *MemInst classes this one has a selection pattern: it matches a LoadParam
   // node whose first operand is the constant 0 and whose second operand is
   // the immediate $b. It is declared outside the `let mayLoad = 1` region.
   1999 class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
   2000       NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
   2001                 !strconcat("mov", opstr, " \t$dst, retval$b;"),
   2002                 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
   2003 
   // Store-side instruction classes, all marked mayStore and all with an empty
   // pattern list (no pattern-based selection in this file; presumably selected
   // from C++ code -- TODO confirm).
   //  - StoreParam{,V2,V4}Inst print "st.param[.v2|.v4]<opstr>" to
   //    [param$a+$b]: $a is appended to the name "param" and $b is the offset.
   //  - StoreRetval{,V2,V4}Inst print the same mnemonics to [func_retval0+$a],
   //    so they take a single immediate (the offset) instead of two.
   // The value operands come first in the operand list, before the immediates.
   2004 let mayStore = 1 in {
   2005   class StoreParamInst<NVPTXRegClass regclass, string opstr> :
   2006         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
   2007                   !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
   2008                   []>;
   2009 
   2010   class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
   2011         NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
   2012                                i32imm:$a, i32imm:$b),
   2013                   !strconcat("st.param.v2", opstr,
   2014                              " \t[param$a+$b], {{$val, $val2}};"),
   2015                   []>;
   2016 
   2017   class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
   2018         NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
   2019                                regclass:$val4, i32imm:$a,
   2020                                i32imm:$b),
   2021                   !strconcat("st.param.v4", opstr,
   2022                              " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
   2023                   []>;
   2024 
   2025   class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
   2026         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
   2027                   !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
   2028                   []>;
   2029 
   2030   class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
   2031         NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
   2032                   !strconcat("st.param.v2", opstr,
   2033                              " \t[func_retval0+$a], {{$val, $val2}};"),
   2034                   []>;
   2035 
   2036   class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
   2037         NVPTXInst<(outs),
   2038                   (ins regclass:$val, regclass:$val2, regclass:$val3,
   2039                        regclass:$val4, i32imm:$a),
   2040                   !strconcat("st.param.v4", opstr,
   2041                              " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
   2042                   []>;
   2043 }
   2044 
   // Multiclass producing the opening fragment of a call instruction, one
   // record per return-value count from 0 to 8. Every record is marked isCall,
   // has no operands, and matches `OpNode` applied to the i32 constant equal to
   // that count. The printed text is `OpcStr` followed by a space and, for
   // counts 1..8, the parenthesised list "(retval0, ..., retvalN-1), ". The asm
   // strings stop there: they contain no callee, argument list or terminating
   // ';' (other instructions in this file print fragments such as "$addr, ",
   // "(" and ");").
   //   OpcStr - mnemonic text placed at the start of the line.
   //   OpNode - SDNode the nine patterns match against.
   2045 let isCall=1 in {
   2046   multiclass CALL<string OpcStr, SDNode OpNode> {
   2047      def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
   2048        !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
   2049      def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
   2050        !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
   2051      def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
   2052        !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
   2053      def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
   2054        !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
   2055      def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
   2056        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
   2057        [(OpNode (i32 4))]>;
   2058      def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
   2059        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
   2060        [(OpNode (i32 5))]>;
   2061      def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
   2062        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
   2063                             "retval5), "),
   2064        [(OpNode (i32 6))]>;
   2065      def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
   2066        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
   2067                             "retval5, retval6), "),
   2068        [(OpNode (i32 7))]>;
   2069      def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
   2070        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
   2071                             "retval5, retval6, retval7), "),
   2072        [(OpNode (i32 8))]>;
   2073   }
   2074 }
   2075 
   // Instantiations of CALL: "call" matched from PrintCall and "call.uni"
   // matched from PrintCallUni. Each defm yields the nine records of the
   // multiclass with the defm name as prefix.
   2076 defm Call : CALL<"call", PrintCall>;
   2077 defm CallUni : CALL<"call.uni", PrintCallUni>;
   2078 
   2079 // Convergent call instructions.  These are identical to regular calls, except
   2080 // they have the isConvergent bit set.
   // They print the same "call" / "call.uni" text but match the
   // PrintConvergentCall / PrintConvergentCallUni nodes.
   2081 let isConvergent=1 in {
   2082   defm ConvergentCall : CALL<"call", PrintConvergentCall>;
   2083   defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
   2084 }
   2085 
   // Concrete ld.param instructions: one record per (register class, type
   // suffix) pair for the scalar, V2 and V4 classes. Points visible in the
   // table:
   //  - the I8 records use Int16Regs with the ".b8" suffix;
   //  - F16 uses ".b16" and F16x2 uses ".b32", while F32/F64 use ".f32"/".f64";
   //  - the V4 rows have no I64 or F64 entry.
   2086 def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
   2087 def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
   2088 def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
   2089 def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
   2090 def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
   2091 def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
   2092 def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
   2093 def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
   2094 def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
   2095 def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
   2096 def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
   2097 def LoadParamMemF16    : LoadParamMemInst<Float16Regs, ".b16">;
   2098 def LoadParamMemF16x2  : LoadParamMemInst<Float16x2Regs, ".b32">;
   2099 def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
   2100 def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
   2101 def LoadParamMemV2F16  : LoadParamV2MemInst<Float16Regs, ".b16">;
   2102 def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
   2103 def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
   2104 def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
   2105 def LoadParamMemV4F16  : LoadParamV4MemInst<Float16Regs, ".b16">;
   2106 def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
   2107 def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
   2108 
   // Concrete st.param instructions writing to [param$a+$b]. The register
   // class / suffix pairing mirrors the LoadParamMem* table: I8 records use
   // Int16Regs with ".b8", F16 uses ".b16", F16x2 uses ".b32", and there is no
   // V4 record for I64 or F64.
   2109 def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
   2110 def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
   2111 
   2112 def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
   2113 def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
   2114 def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
   2115 def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
   2116 def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
   2117 def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;
   2118 
   2119 def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
   2120 def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
   2121 def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;
   2122 
   2123 def StoreParamF16      : StoreParamInst<Float16Regs, ".b16">;
   2124 def StoreParamF16x2    : StoreParamInst<Float16x2Regs, ".b32">;
   2125 def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
   2126 def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
   2127 def StoreParamV2F16    : StoreParamV2Inst<Float16Regs, ".b16">;
   2128 def StoreParamV2F16x2  : StoreParamV2Inst<Float16x2Regs, ".b32">;
   2129 def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
   2130 def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
   2131 def StoreParamV4F16    : StoreParamV4Inst<Float16Regs, ".b16">;
   2132 def StoreParamV4F16x2  : StoreParamV4Inst<Float16x2Regs, ".b32">;
   2133 def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
   2134 
   // Concrete st.param instructions writing to [func_retval0+$a]. Same register
   // class / suffix pairing as the StoreParam* records (I8 via Int16Regs and
   // ".b8", F16 via ".b16", F16x2 via ".b32"); again no V4 record for I64 or
   // F64.
   2135 def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
   2136 def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
   2137 def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
   2138 def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
   2139 def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
   2140 def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
   2141 def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
   2142 def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
   2143 def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
   2144 def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
   2145 def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
   2146 
   2147 def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
   2148 def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
   2149 def StoreRetvalF16    : StoreRetvalInst<Float16Regs, ".b16">;
   2150 def StoreRetvalF16x2  : StoreRetvalInst<Float16x2Regs, ".b32">;
   2151 def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
   2152 def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
   2153 def StoreRetvalV2F16  : StoreRetvalV2Inst<Float16Regs, ".b16">;
   2154 def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
   2155 def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
   2156 def StoreRetvalV4F16  : StoreRetvalV4Inst<Float16Regs, ".b16">;
   2157 def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
   2158 
   // Operand-less instructions that print fixed punctuation or "ret;".
   // CallArgEnd is matched twice: with constant 1 it prints ");" and with
   // constant 0 it prints ")" without the semicolon.
   2159 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
   2160 def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
   2161 def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
   2162 def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
   2163 
   // Prints one register call argument followed by ", ". Matches a CallArg
   // node whose first operand is the constant 0 and whose second operand is a
   // register of `regclass`.
   2164 class CallArgInst<NVPTXRegClass regclass> :
   2165   NVPTXInst<(outs), (ins regclass:$a), "$a, ",
   2166             [(CallArg (i32 0), regclass:$a)]>;
   2167 
   // Same as CallArgInst but for the LastCallArg node: prints the register
   // with no trailing ", ".
   2168 class LastCallArgInst<NVPTXRegClass regclass> :
   2169   NVPTXInst<(outs), (ins regclass:$a), "$a",
   2170             [(LastCallArg (i32 0), regclass:$a)]>;
   2171 
   // Register-argument records for the Int64, Int32, Int16, Float64 and
   // Float32 register classes, in CallArg and LastCallArg flavours. No record
   // is defined here for Float16Regs or Float16x2Regs.
   2172 def CallArgI64     : CallArgInst<Int64Regs>;
   2173 def CallArgI32     : CallArgInst<Int32Regs>;
   2174 def CallArgI16     : CallArgInst<Int16Regs>;
   2175 def CallArgF64     : CallArgInst<Float64Regs>;
   2176 def CallArgF32     : CallArgInst<Float32Regs>;
   2177 
   2178 def LastCallArgI64 : LastCallArgInst<Int64Regs>;
   2179 def LastCallArgI32 : LastCallArgInst<Int32Regs>;
   2180 def LastCallArgI16 : LastCallArgInst<Int16Regs>;
   2181 def LastCallArgF64 : LastCallArgInst<Float64Regs>;
   2182 def LastCallArgF32 : LastCallArgInst<Float32Regs>;
   2183 
   // Immediate-operand call arguments. The first operand of the matched node
   // selects the printed form: constant 0 prints the immediate itself ("$a"),
   // constant 1 prints it as a parameter name ("param$a"). The Last* records
   // omit the trailing ", ".
   2184 def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
   2185                               [(CallArg (i32 0), (i32 imm:$a))]>;
   2186 def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
   2187                                   [(LastCallArg (i32 0), (i32 imm:$a))]>;
   2188 
   2189 def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
   2190                              [(CallArg (i32 1), (i32 imm:$a))]>;
   2191 def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
   2192                                  [(LastCallArg (i32 1), (i32 imm:$a))]>;
   2193 
   // Callee operand of a call, printed as "$addr, ". Three forms of CallVoid
   // are matched: a wrapped global address, a 32-bit register and a 64-bit
   // register. PrototypeInst prints ", prototype_$val;" for a Prototype node
   // with an immediate operand; it is the only record in this group whose text
   // ends with ';'.
   2194 def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
   2195                                   [(CallVoid (Wrapper tglobaladdr:$addr))]>;
   2196 def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
   2197                                   [(CallVoid Int32Regs:$addr)]>;
   2198 def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
   2199                                   [(CallVoid Int64Regs:$addr)]>;
   2200 def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
   2201                                   [(Prototype (i32 imm:$val))]>;
   2202 
   // Declarations of the return-value symbol "retval$num".
   //  - DeclareRetMemInst matches DeclareRetParam(align, size, num) and prints
   //    an aligned .b8 array of $size elements.
   //  - DeclareRetScalarInst / DeclareRetRegInst both match DeclareRet; the
   //    first operand selects the form: constant 1 prints ".param .b$size",
   //    constant 2 prints ".reg .b$size".
   2203 def DeclareRetMemInst :
   2204   NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
   2205             ".param .align $align .b8 retval$num[$size];",
   2206             [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
   2207 def DeclareRetScalarInst :
   2208   NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
   2209             ".param .b$size retval$num;",
   2210             [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
   2211 def DeclareRetRegInst :
   2212   NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
   2213             ".reg .b$size retval$num;",
   2214             [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
   2215 
   // Declarations of the outgoing-argument symbol "param$a".
   //  - DeclareParamInst matches DeclareParam(align, a, size) and prints an
   //    aligned .b8 array of $size elements.
   //  - DeclareScalarParamInst / DeclareScalarRegInst both match
   //    DeclareScalarParam(a, size, k); here the LAST operand selects the form:
   //    constant 0 prints ".param .b$size", constant 1 prints ".reg .b$size".
   // Note the operand order and selector position differ from the DeclareRet*
   // records (selector first, values 1 and 2).
   2216 def DeclareParamInst :
   2217   NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
   2218             ".param .align $align .b8 param$a[$size];",
   2219             [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
   2220 def DeclareScalarParamInst :
   2221   NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
   2222             ".param .b$size param$a;",
   2223             [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
   2224 def DeclareScalarRegInst :
   2225   NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
   2226             ".reg .b$size param$a;",
   2227             [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
   2228 
   // Register-to-register copy selected from the MoveParam node: prints
   // "mov<asmstr> $dst, $src;" with source and destination in the same
   // register class.
   2229 class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
   2230   NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
   2231             !strconcat("mov", asmstr, " \t$dst, $src;"),
   2232             [(set regclass:$dst, (MoveParam regclass:$src))]>;
   2233 
   2234 def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
   2235 def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
   // The 16-bit integer case is written out by hand and prints a cvt rather
   // than a mov.
   // NOTE(review): the mnemonic names a u32 source while $src is declared as
   // Int16Regs -- confirm this is intended.
   2236 def MoveParamI16 :
   2237   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
   2238             "cvt.u16.u32 \t$dst, $src;",
   2239             [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
   2240 def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
   2241 def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
   2242 def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;
   2243 
   // Selected from the PseudoUseParam node. The asm string starts with "//",
   // so only an assembly comment naming $src is printed. Records exist for the
   // Int64, Int32, Int16, Float64 and Float32 register classes.
   2244 class PseudoUseParamInst<NVPTXRegClass regclass> :
   2245   NVPTXInst<(outs), (ins regclass:$src),
   2246             "// Pseudo use of $src",
   2247             [(PseudoUseParam regclass:$src)]>;
   2248 
   2249 def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
   2250 def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
   2251 def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
   2252 def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
   2253 def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
   2254 
   2255 
   2256 //
   2257 // Load / Store Handling
   2258 //
   // Scalar load multiclass. For a destination register class it defines six
   // records that differ only in how the address is supplied:
   //   _avar     imem:$addr
   //   _areg     Int32Regs:$addr
   //   _areg_64  Int64Regs:$addr
   //   _ari      Int32Regs:$addr + i32imm:$offset
   //   _ari_64   Int64Regs:$addr + i32imm:$offset
   //   _asi      imem:$addr      + i32imm:$offset
   // Every record takes the same five leading operands (isVol, addsp, Vec,
   // Sign as LdStCode, then the immediate fromWidth) and prints them through
   // the ${name:modifier} forms in the shared "ld..." asm string. The pattern
   // lists are empty, so no DAG pattern in this file selects these records
   // (presumably selection is done in C++ -- TODO confirm).
   2259 multiclass LD<NVPTXRegClass regclass> {
   2260   def _avar : NVPTXInst<
   2261     (outs regclass:$dst),
   2262     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2263          i32imm:$fromWidth, imem:$addr),
   2264     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2265     "\t$dst, [$addr];", []>;
   2266   def _areg : NVPTXInst<
   2267     (outs regclass:$dst),
   2268     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2269          i32imm:$fromWidth, Int32Regs:$addr),
   2270     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2271     "\t$dst, [$addr];", []>;
   2272   def _areg_64 : NVPTXInst<
   2273     (outs regclass:$dst),
   2274     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2275          i32imm:$fromWidth, Int64Regs:$addr),
   2276     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2277     "\t$dst, [$addr];", []>;
   2278   def _ari : NVPTXInst<
   2279     (outs regclass:$dst),
   2280     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2281          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
   2282     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2283     "\t$dst, [$addr+$offset];", []>;
   2284   def _ari_64 : NVPTXInst<
   2285     (outs regclass:$dst),
   2286     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
   2287          LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
   2288     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2289     "\t$dst, [$addr+$offset];", []>;
   2290   def _asi : NVPTXInst<
   2291     (outs regclass:$dst),
   2292     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
   2293          LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
   2294     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2295     "\t$dst, [$addr+$offset];", []>;
   2296 }
   2297 
   // Scalar load instructions, one LD instantiation per value type, all marked
   // mayLoad with hasSideEffects cleared. LD_i8 and LD_i16 both use Int16Regs
   // as the destination class.
   2298 let mayLoad=1, hasSideEffects=0 in {
   2299   defm LD_i8  : LD<Int16Regs>;
   2300   defm LD_i16 : LD<Int16Regs>;
   2301   defm LD_i32 : LD<Int32Regs>;
   2302   defm LD_i64 : LD<Int64Regs>;
   2303   defm LD_f16 : LD<Float16Regs>;
   2304   defm LD_f16x2 : LD<Float16x2Regs>;
   2305   defm LD_f32 : LD<Float32Regs>;
   2306   defm LD_f64 : LD<Float64Regs>;
   2307 }
   2308 
   // Scalar store multiclass, the counterpart of LD. It defines the same six
   // addressing variants (_avar, _areg, _areg_64, _ari, _ari_64, _asi). Each
   // record has no outputs; the stored register $src is the first input,
   // followed by the four LdStCode operands, the immediate toWidth and the
   // address operand(s). The asm string prints "st..." with the address first
   // and $src last. Pattern lists are empty (presumably selected in C++ --
   // TODO confirm).
   2309 multiclass ST<NVPTXRegClass regclass> {
   2310   def _avar : NVPTXInst<
   2311     (outs),
   2312     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
   2313          LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
   2314     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
   2315     " \t[$addr], $src;", []>;
   2316   def _areg : NVPTXInst<
   2317     (outs),
   2318     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
   2319          LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
   2320     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
   2321     " \t[$addr], $src;", []>;
   2322   def _areg_64 : NVPTXInst<
   2323     (outs),
   2324     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
   2325          LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
   2326     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
   2327     " \t[$addr], $src;", []>;
   2328   def _ari : NVPTXInst<
   2329     (outs),
   2330     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
   2331          LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
   2332     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
   2333     " \t[$addr+$offset], $src;", []>;
   2334   def _ari_64 : NVPTXInst<
   2335     (outs),
   2336     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
   2337          LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
   2338     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
   2339     " \t[$addr+$offset], $src;", []>;
   2340   def _asi : NVPTXInst<
   2341     (outs),
   2342     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
   2343          LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
   2344     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
   2345     " \t[$addr+$offset], $src;", []>;
   2346 }
   2347 
   // Scalar store instructions, one ST instantiation per value type, all
   // marked mayStore with hasSideEffects cleared. ST_i8 and ST_i16 both use
   // Int16Regs as the source class.
   2348 let mayStore=1, hasSideEffects=0 in {
   2349   defm ST_i8  : ST<Int16Regs>;
   2350   defm ST_i16 : ST<Int16Regs>;
   2351   defm ST_i32 : ST<Int32Regs>;
   2352   defm ST_i64 : ST<Int64Regs>;
   2353   defm ST_f16 : ST<Float16Regs>;
   2354   defm ST_f16x2 : ST<Float16x2Regs>;
   2355   defm ST_f32 : ST<Float32Regs>;
   2356   defm ST_f64 : ST<Float64Regs>;
   2357 }
   2358 
   2359 // The following are used only during and after vector elementization.  Vector
   2360 // elementization happens at the machine instruction level, so the following
   2361 // instructions never appear in the DAG.
   // Vector load multiclass: twelve records, i.e. the six addressing variants
   // used by LD (_avar, _areg, _areg_64, _ari, _ari_64, _asi) for two-element
   // (_v2_*) and four-element (_v4_*) destinations. All destination registers
   // are of the same `regclass`; the input operand list and the "ld..." prefix
   // of the asm string are the same as in LD, and the destinations are printed
   // as a brace-enclosed list. Pattern lists are empty.
   // NOTE(review): the doubled braces in the asm strings look like the escape
   // for printing a literal '{' / '}' -- confirm.
   2362 multiclass LD_VEC<NVPTXRegClass regclass> {
   2363   def _v2_avar : NVPTXInst<
   2364     (outs regclass:$dst1, regclass:$dst2),
   2365     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2366          i32imm:$fromWidth, imem:$addr),
   2367     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2368     "\t{{$dst1, $dst2}}, [$addr];", []>;
   2369   def _v2_areg : NVPTXInst<
   2370     (outs regclass:$dst1, regclass:$dst2),
   2371     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2372          i32imm:$fromWidth, Int32Regs:$addr),
   2373     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2374     "\t{{$dst1, $dst2}}, [$addr];", []>;
   2375   def _v2_areg_64 : NVPTXInst<
   2376     (outs regclass:$dst1, regclass:$dst2),
   2377     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2378          i32imm:$fromWidth, Int64Regs:$addr),
   2379     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2380     "\t{{$dst1, $dst2}}, [$addr];", []>;
   2381   def _v2_ari : NVPTXInst<
   2382     (outs regclass:$dst1, regclass:$dst2),
   2383     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2384          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
   2385     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2386     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
   2387   def _v2_ari_64 : NVPTXInst<
   2388     (outs regclass:$dst1, regclass:$dst2),
   2389     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2390          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
   2391     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2392     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
   2393   def _v2_asi : NVPTXInst<
   2394     (outs regclass:$dst1, regclass:$dst2),
   2395     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2396          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
   2397     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2398     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
   2399   def _v4_avar : NVPTXInst<
   2400     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
   2401     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2402          i32imm:$fromWidth, imem:$addr),
   2403     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2404     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
   2405   def _v4_areg : NVPTXInst<
   2406     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
   2407     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2408          i32imm:$fromWidth, Int32Regs:$addr),
   2409     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2410     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
   2411   def _v4_areg_64 : NVPTXInst<
   2412     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
   2413     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2414          i32imm:$fromWidth, Int64Regs:$addr),
   2415     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2416     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
   2417   def _v4_ari : NVPTXInst<
   2418     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
   2419     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2420          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
   2421     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2422     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
   2423   def _v4_ari_64 : NVPTXInst<
   2424     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
   2425     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2426          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
   2427     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2428     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
   2429   def _v4_asi : NVPTXInst<
   2430     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
   2431     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2432          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
   2433     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2434     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
   2435 }
   // Vector load instructions, one LD_VEC instantiation per element type, all
   // marked mayLoad with hasSideEffects cleared. LDV_i8 and LDV_i16 both use
   // Int16Regs for their destination registers.
   2436 let mayLoad=1, hasSideEffects=0 in {
   2437   defm LDV_i8  : LD_VEC<Int16Regs>;
   2438   defm LDV_i16 : LD_VEC<Int16Regs>;
   2439   defm LDV_i32 : LD_VEC<Int32Regs>;
   2440   defm LDV_i64 : LD_VEC<Int64Regs>;
   2441   defm LDV_f16 : LD_VEC<Float16Regs>;
   2442   defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
   2443   defm LDV_f32 : LD_VEC<Float32Regs>;
   2444   defm LDV_f64 : LD_VEC<Float64Regs>;
   2445 }
   2446 
   2447 multiclass ST_VEC<NVPTXRegClass regclass> {
   2448   def _v2_avar : NVPTXInst<
   2449     (outs),
   2450     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
   2451          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
   2452     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2453     "\t[$addr], {{$src1, $src2}};", []>;
   2454   def _v2_areg : NVPTXInst<
   2455     (outs),
   2456     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
   2457          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
   2458     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2459     "\t[$addr], {{$src1, $src2}};", []>;
   2460   def _v2_areg_64 : NVPTXInst<
   2461     (outs),
   2462     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
   2463          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
   2464     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2465     "\t[$addr], {{$src1, $src2}};", []>;
   2466   def _v2_ari : NVPTXInst<
   2467     (outs),
   2468     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
   2469          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
   2470          i32imm:$offset),
   2471     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2472     "\t[$addr+$offset], {{$src1, $src2}};", []>;
   2473   def _v2_ari_64 : NVPTXInst<
   2474     (outs),
   2475     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
   2476          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
   2477          i32imm:$offset),
   2478     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2479     "\t[$addr+$offset], {{$src1, $src2}};", []>;
   2480   def _v2_asi : NVPTXInst<
   2481     (outs),
   2482     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
   2483          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
   2484          i32imm:$offset),
   2485     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2486     "\t[$addr+$offset], {{$src1, $src2}};", []>;
   2487   def _v4_avar : NVPTXInst<
   2488     (outs),
   2489     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
   2490          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2491          i32imm:$fromWidth, imem:$addr),
   2492     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2493     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
   2494   def _v4_areg : NVPTXInst<
   2495     (outs),
   2496     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
   2497          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2498          i32imm:$fromWidth, Int32Regs:$addr),
   2499     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2500     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
   2501   def _v4_areg_64 : NVPTXInst<
   2502     (outs),
   2503     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
   2504          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2505          i32imm:$fromWidth, Int64Regs:$addr),
   2506     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2507     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
   2508   def _v4_ari : NVPTXInst<
   2509     (outs),
   2510     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
   2511          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2512          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
   2513     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2514     "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
   2515   def _v4_ari_64 : NVPTXInst<
   2516     (outs),
   2517     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
   2518          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2519          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
   2520     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
   2521     "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
   2522   def _v4_asi : NVPTXInst<
   2523     (outs),
   2524     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
   2525          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
   2526          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
   2527     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
   2528     "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
   2529 }
   2530 
   2531 let mayStore=1, hasSideEffects=0 in {
   2532   defm STV_i8  : ST_VEC<Int16Regs>;
   2533   defm STV_i16 : ST_VEC<Int16Regs>;
   2534   defm STV_i32 : ST_VEC<Int32Regs>;
   2535   defm STV_i64 : ST_VEC<Int64Regs>;
   2536   defm STV_f16 : ST_VEC<Float16Regs>;
   2537   defm STV_f16x2 : ST_VEC<Float16x2Regs>;
   2538   defm STV_f32 : ST_VEC<Float32Regs>;
   2539   defm STV_f64 : ST_VEC<Float64Regs>;
   2540 }
   2541 
   2542 //---- Conversion ----
   2543 
   2544 class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
   2545                    NVPTXRegClass regclassOut>
   2546   : NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
   2547               !strconcat("mov.b", SzStr, " \t$d, $a;"),
   2548               [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
   2549 
   2550 def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
   2551 def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
   2552 def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
   2553 def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
   2554 def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
   2555 def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
   2556 def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
   2557 def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
   2558 
   2559 // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
   2560 // we cannot specify floating-point literals in isel patterns.  Therefore, we
   2561 // use an integer selp to select either 1 or 0 and then cvt to floating-point.
   2562 
   2563 // sint -> f16
   2564 def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
   2565           (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
   2566 def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
   2567           (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
   2568 def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
   2569           (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
   2570 def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
   2571           (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
   2572 
   2573 // uint -> f16
   2574 def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
   2575           (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
   2576 def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
   2577           (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
   2578 def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
   2579           (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
   2580 def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
   2581           (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
   2582 
   2583 // sint -> f32
   2584 def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
   2585           (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
   2586 def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
   2587           (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
   2588 def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
   2589           (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
   2590 def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
   2591           (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
   2592 
   2593 // uint -> f32
   2594 def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
   2595           (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
   2596 def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
   2597           (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
   2598 def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
   2599           (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
   2600 def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
   2601           (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
   2602 
   2603 // sint -> f64
   2604 def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
   2605           (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
   2606 def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
   2607           (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
   2608 def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
   2609           (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
   2610 def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
   2611           (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
   2612 
   2613 // uint -> f64
   2614 def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
   2615           (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
   2616 def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
   2617           (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
   2618 def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
   2619           (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
   2620 def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
   2621           (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
   2622 
   2623 
   2624 // f16 -> sint
   2625 def : Pat<(i1 (fp_to_sint Float16Regs:$a)),
   2626           (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
   2627 // PTX permits .ftz on cvt only when the source or destination type is .f32,
   2628 // so the f16 -> integer conversions below have no FTZ variants.
   2629 def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
   2630           (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
   2633 def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
   2634           (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
   2637 def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
   2638           (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;
   2639 
   2640 // f16 -> uint
   2641 def : Pat<(i1 (fp_to_uint Float16Regs:$a)),
   2642           (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
   2643 // PTX permits .ftz on cvt only when the source or destination type is .f32,
   2644 // so the f16 -> integer conversions below have no FTZ variants.
   2645 def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
   2646           (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
   2649 def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
   2650           (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
   2653 def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
   2654           (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
   2655 
   2656 // f32 -> sint
   2657 def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
   2658           (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
   2659 def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
   2660           (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
   2661 def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
   2662           (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
   2663 def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
   2664           (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
   2665 def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
   2666           (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
   2667 def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
   2668           (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
   2669 def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
   2670           (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
   2671 
   2672 // f32 -> uint
   2673 def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
   2674           (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
   2675 def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
   2676           (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
   2677 def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
   2678           (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
   2679 def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
   2680           (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
   2681 def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
   2682           (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
   2683 def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
   2684           (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
   2685 def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
   2686           (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
   2687 
   2688 // f64 -> sint
   2689 def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
   2690           (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
   2691 def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
   2692           (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
   2693 def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
   2694           (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
   2695 def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
   2696           (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
   2697 
   2698 // f64 -> uint
   2699 def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
   2700           (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
   2701 def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
   2702           (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
   2703 def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
   2704           (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
   2705 def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
   2706           (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
   2707 
   2708 // sext i1
   2709 def : Pat<(i16 (sext Int1Regs:$a)),
   2710           (SELP_s16ii -1, 0, Int1Regs:$a)>;
   2711 def : Pat<(i32 (sext Int1Regs:$a)),
   2712           (SELP_s32ii -1, 0, Int1Regs:$a)>;
   2713 def : Pat<(i64 (sext Int1Regs:$a)),
   2714           (SELP_s64ii -1, 0, Int1Regs:$a)>;
   2715 
   2716 // zext i1
   2717 def : Pat<(i16 (zext Int1Regs:$a)),
   2718           (SELP_u16ii 1, 0, Int1Regs:$a)>;
   2719 def : Pat<(i32 (zext Int1Regs:$a)),
   2720           (SELP_u32ii 1, 0, Int1Regs:$a)>;
   2721 def : Pat<(i64 (zext Int1Regs:$a)),
   2722           (SELP_u64ii 1, 0, Int1Regs:$a)>;
   2723 
   2724 // anyext i1
   2725 def : Pat<(i16 (anyext Int1Regs:$a)),
   2726           (SELP_u16ii -1, 0, Int1Regs:$a)>;
   2727 def : Pat<(i32 (anyext Int1Regs:$a)),
   2728           (SELP_u32ii -1, 0, Int1Regs:$a)>;
   2729 def : Pat<(i64 (anyext Int1Regs:$a)),
   2730           (SELP_u64ii -1, 0, Int1Regs:$a)>;
   2731 
   2732 // sext i16
   2733 def : Pat<(i32 (sext Int16Regs:$a)),
   2734           (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
   2735 def : Pat<(i64 (sext Int16Regs:$a)),
   2736           (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
   2737 
   2738 // zext i16
   2739 def : Pat<(i32 (zext Int16Regs:$a)),
   2740           (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
   2741 def : Pat<(i64 (zext Int16Regs:$a)),
   2742           (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
   2743 
   2744 // anyext i16
   2745 def : Pat<(i32 (anyext Int16Regs:$a)),
   2746           (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
   2747 def : Pat<(i64 (anyext Int16Regs:$a)),
   2748           (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
   2749 
   2750 // sext i32
   2751 def : Pat<(i64 (sext Int32Regs:$a)),
   2752           (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
   2753 
   2754 // zext i32
   2755 def : Pat<(i64 (zext Int32Regs:$a)),
   2756           (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
   2757 
   2758 // anyext i32
   2759 def : Pat<(i64 (anyext Int32Regs:$a)),
   2760           (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
   2761 
   2762 
   2763 // truncate i64
   2764 def : Pat<(i32 (trunc Int64Regs:$a)),
   2765           (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
   2766 def : Pat<(i16 (trunc Int64Regs:$a)),
   2767           (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
   2768 def : Pat<(i1 (trunc Int64Regs:$a)),
   2769           (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
   2770 
   2771 // truncate i32
   2772 def : Pat<(i16 (trunc Int32Regs:$a)),
   2773           (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
   2774 def : Pat<(i1 (trunc Int32Regs:$a)),
   2775           (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
   2776 
   2777 // truncate i16
   2778 def : Pat<(i1 (trunc Int16Regs:$a)),
   2779           (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
   2780 
   2781 // sext_inreg
   2782 def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
   2783 def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
   2784 def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
   2785 def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
   2786 def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
   2787 def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
   2788 
   2789 
   2790 // Select instructions with 32-bit predicates
   2791 def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
   2792           (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
   2793           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
   2794 def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
   2795           (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
   2796           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
   2797 def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
   2798           (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
   2799           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
   2800 def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
   2801           (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
   2802           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
   2803 def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
   2804           (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
   2805           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
   2806 def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
   2807           (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
   2808           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
   2809 
   2810 
   2811 let hasSideEffects = 0 in {
   2812   // pack a set of smaller int registers to a larger int register
   2813   def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
   2814                              (ins Int16Regs:$s1, Int16Regs:$s2,
   2815                                   Int16Regs:$s3, Int16Regs:$s4),
   2816                              "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
   2817   def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
   2818                              (ins Int16Regs:$s1, Int16Regs:$s2),
   2819                              "mov.b32 \t$d, {{$s1, $s2}};", []>;
   2820   def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
   2821                              (ins Int32Regs:$s1, Int32Regs:$s2),
   2822                              "mov.b64 \t$d, {{$s1, $s2}};", []>;
   2823   def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
   2824                              (ins Float32Regs:$s1, Float32Regs:$s2),
   2825                              "mov.b64 \t$d, {{$s1, $s2}};", []>;
   2826 
   2827   // unpack a larger int register to a set of smaller int registers
   2828   def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
   2829                                    Int16Regs:$d3, Int16Regs:$d4),
   2830                              (ins Int64Regs:$s),
   2831                              "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
   2832   def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
   2833                              (ins Int32Regs:$s),
   2834                              "mov.b32 \t{{$d1, $d2}}, $s;", []>;
   2835   def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
   2836                              (ins Int64Regs:$s),
   2837                              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
   2838   def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
   2839                              (ins Float64Regs:$s),
   2840                              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
   2841 
   2842 }
   2843 
   2844 let hasSideEffects = 0 in {
   2845   // Extract element of f16x2 register. PTX does not provide any way
   2846   // to access elements of f16x2 vector directly, so we need to
   2847   // extract it using a temporary register.
   2848   def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
   2849                                (ins Float16x2Regs:$src),
   2850                                "{{ .reg .b16 \t%tmp_hi;\n\t"
   2851                                "  mov.b32 \t{$dst, %tmp_hi}, $src; }}",
   2852                                [(set Float16Regs:$dst,
   2853                                  (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
   2854   def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
   2855                                (ins Float16x2Regs:$src),
   2856                                "{{ .reg .b16 \t%tmp_lo;\n\t"
   2857                                "  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
   2858                                [(set Float16Regs:$dst,
   2859                                  (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
   2860 
   2861   // Coalesce two f16 registers into f16x2
   2862   def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
   2863                              (ins Float16Regs:$a, Float16Regs:$b),
   2864                              "mov.b32 \t$dst, {{$a, $b}};",
   2865                              [(set Float16x2Regs:$dst,
   2866                                (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
   2867 
   2868   // Directly initializing the underlying b32 register is one less SASS
   2869   // instruction than a vector-packing move.
   2870   def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
   2871                               "mov.b32 \t$dst, $src;",
   2872                               []>;
   2873 
   2874   // Split f16x2 into two f16 registers.
   2875   def SplitF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
   2876                               (ins Float16x2Regs:$src),
   2877                               "mov.b32 \t{{$lo, $hi}}, $src;",
   2878                               []>;
   2879   // Split an i32 into two f16
   2880   def SplitI32toF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
   2881                                    (ins Int32Regs:$src),
   2882                                    "mov.b32 \t{{$lo, $hi}}, $src;",
   2883                                    []>;
   2884 }
   2885 
   2886 // Count leading zeros
   2887 let hasSideEffects = 0 in {
   2888   def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
   2889                          "clz.b32 \t$d, $a;", []>;
   2890   def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
   2891                          "clz.b64 \t$d, $a;", []>;
   2892 }
   2893 
   2894 // 32-bit has a direct PTX instruction
   2895 def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
   2896 
   2897 // The return type of the ctlz ISD node is the same as its input, but the PTX
   2898 // clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
   2899 // ptx value to 64 bits to match the ISD node's semantics, unless we know we're
   2900 // truncating back down to 32 bits.
   2901 def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
   2902 def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
   2903 
   2904 // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
   2905 // result back to 16-bits if necessary.  We also need to subtract 16 because
   2906 // the high-order 16 zeros were counted.
   2907 //
   2908 // TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
   2909 // use to save one SASS instruction (on sm_35 anyway):
   2910 //
   2911 //   mov.b32 $tmp, {0xffff, $a}
   2912 //   clz.b32 $result, $tmp
   2913 //
   2914 // That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
   2915 // and then ctlz that value.  This way we don't have to subtract 16 from the
   2916 // result.  Unfortunately today we don't have a way to generate
   2917 // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
   2918 def : Pat<(ctlz Int16Regs:$a),
   2919           (SUBi16ri (CVT_u16_u32
   2920            (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
   2921 def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
   2922           (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
   2923 
   2924 // Population count
   2925 let hasSideEffects = 0 in {
   2926   def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
   2927                           "popc.b32 \t$d, $a;", []>;
   2928   def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
   2929                           "popc.b64 \t$d, $a;", []>;
   2930 }
   2931 
   2932 // 32-bit has a direct PTX instruction
   2933 def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
   2934 
   2935 // For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
   2936 // to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
   2937 // pattern that avoids the type conversion if we're truncating the result to
   2938 // i32 anyway.
   2939 def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
   2940 def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
   2941 
   2942 // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
   2943 // If we know that we're storing into an i32, we can avoid the final trunc.
   2944 def : Pat<(ctpop Int16Regs:$a),
   2945           (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
   2946 def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
   2947           (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
   2948 
   2949 // fpround f32 -> f16
   2950 def : Pat<(f16 (fpround Float32Regs:$a)),
   2951           (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
   2952 def : Pat<(f16 (fpround Float32Regs:$a)),
   2953           (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
   2954 
   2955 // fpround f64 -> f16
   2956 // No FTZ variant: PTX permits .ftz on cvt only when the source or
   2957 // destination type is .f32.
   2958 def : Pat<(f16 (fpround Float64Regs:$a)),
   2959           (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
   2960 
   2961 // fpround f64 -> f32
   2962 def : Pat<(f32 (fpround Float64Regs:$a)),
   2963           (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
   2964 def : Pat<(f32 (fpround Float64Regs:$a)),
   2965           (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
   2966 
   2967 // fpextend f16 -> f32
   2968 def : Pat<(f32 (fpextend Float16Regs:$a)),
   2969           (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
   2970 def : Pat<(f32 (fpextend Float16Regs:$a)),
   2971           (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;
   2972 
   2973 // fpextend f16 -> f64
   2974 // No FTZ variant: PTX permits .ftz on cvt only when the source or
   2975 // destination type is .f32.
   2976 def : Pat<(f64 (fpextend Float16Regs:$a)),
   2977           (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;
   2978 
   2979 // fpextend f32 -> f64
   2980 def : Pat<(f64 (fpextend Float32Regs:$a)),
   2981           (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
   2982 def : Pat<(f64 (fpextend Float32Regs:$a)),
   2983           (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
   2984 
   2985 def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
   2986                      [SDNPHasChain, SDNPOptInGlue]>;
   2987 
   2988 // fceil, ffloor, fround, ftrunc.
   2989 
   2990 // f16: PTX allows .ftz on cvt only when the source or destination type is
   2991 // .f32, so a single non-FTZ pattern serves both FTZ modes.
   2992 def : Pat<(fceil Float16Regs:$a),
   2993           (CVT_f16_f16 Float16Regs:$a, CvtRPI)>;
   2994 def : Pat<(fceil Float32Regs:$a),
   2995           (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
   2996 def : Pat<(fceil Float32Regs:$a),
   2997           (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
   2998 def : Pat<(fceil Float64Regs:$a),
   2999           (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
   3000 
   3001 // f16: PTX allows .ftz on cvt only when the source or destination type is
   3002 // .f32, so a single non-FTZ pattern serves both FTZ modes.
   3003 def : Pat<(ffloor Float16Regs:$a),
   3004           (CVT_f16_f16 Float16Regs:$a, CvtRMI)>;
   3005 def : Pat<(ffloor Float32Regs:$a),
   3006           (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
   3007 def : Pat<(ffloor Float32Regs:$a),
   3008           (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
   3009 def : Pat<(ffloor Float64Regs:$a),
   3010           (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
   3011 
   3012 // f16: PTX allows .ftz on cvt only when the source or destination type is
   3013 // .f32, so a single non-FTZ pattern serves both FTZ modes.
   3014 def : Pat<(f16 (fround Float16Regs:$a)),
   3015           (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
   3016 def : Pat<(fround Float32Regs:$a),
   3017           (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
   3018 def : Pat<(f32 (fround Float32Regs:$a)),
   3019           (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
   3020 def : Pat<(f64 (fround Float64Regs:$a)),
   3021           (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
   3022 
   3023 // f16: PTX allows .ftz on cvt only when the source or destination type is
   3024 // .f32, so a single non-FTZ pattern serves both FTZ modes.
   3025 def : Pat<(ftrunc Float16Regs:$a),
   3026           (CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
   3027 def : Pat<(ftrunc Float32Regs:$a),
   3028           (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
   3029 def : Pat<(ftrunc Float32Regs:$a),
   3030           (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
   3031 def : Pat<(ftrunc Float64Regs:$a),
   3032           (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
   3033 
   3034 // nearbyint and rint are implemented as rounding to nearest even.  This isn't
   3035 // strictly correct, because it causes us to ignore the rounding mode.  But it
   3036 // matches what CUDA's "libm" does.
   3037 
   3038 // f16: PTX allows .ftz on cvt only when the source or destination type is
   3039 // .f32, so a single non-FTZ pattern serves both FTZ modes.
   3040 def : Pat<(fnearbyint Float16Regs:$a),
   3041           (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
   3042 def : Pat<(fnearbyint Float32Regs:$a),
   3043           (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
   3044 def : Pat<(fnearbyint Float32Regs:$a),
   3045           (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
   3046 def : Pat<(fnearbyint Float64Regs:$a),
   3047           (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
   3048 
   3049 // f16: PTX allows .ftz on cvt only when the source or destination type is
   3050 // .f32, so a single non-FTZ pattern serves both FTZ modes.
   3051 def : Pat<(frint Float16Regs:$a),
   3052           (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
   3053 def : Pat<(frint Float32Regs:$a),
   3054           (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
   3055 def : Pat<(frint Float32Regs:$a),
   3056           (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
   3057 def : Pat<(frint Float64Regs:$a),
   3058           (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
   3059 
   3060 
   3061 //-----------------------------------
   3062 // Control-flow
   3063 //-----------------------------------
   3064 
   3065 let isTerminator=1 in {
   3066    let isReturn=1, isBarrier=1 in
   3067       def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
   3068 
   3069    let isBranch=1 in
   3070       def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
   3071                               "@$a bra \t$target;",
   3072                               [(brcond Int1Regs:$a, bb:$target)]>;
   3073    let isBranch=1 in
   3074       def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
   3075                                    "@!$a bra \t$target;", []>;
   3076 
   3077    let isBranch=1, isBarrier=1 in
   3078       def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
   3079                            "bra.uni \t$target;", [(br bb:$target)]>;
   3080 }
   3081 
   3082 def : Pat<(brcond Int32Regs:$a, bb:$target),
   3083           (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
   3084 
   3085 // SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
   3086 // conditional branch if the target block is the next block so that the code
   3087 // can fall through to the target block.  The inversion is done by 'xor
   3088 // condition, 1', which will be translated to (setne condition, -1).  Since ptx
   3089 // supports '@!pred bra target', we should use it.
   3090 def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
   3091           (CBranchOther Int1Regs:$a, bb:$target)>;
   3092 
   3093 // Call
        //
        // Type profiles for the call-sequence start/end markers.  Both constrain
        // operands 0 and 1 to i32.
   3094 def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
   3095                                             SDTCisVT<1, i32>]>;
   3096 def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
   3097 
        // SDNode wrappers for ISD::CALLSEQ_START and ISD::CALLSEQ_END.  Both carry
        // SDNPHasChain, SDNPOutGlue and SDNPSideEffect; callseq_end additionally
        // carries SDNPOptInGlue.
   3098 def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
   3099                            [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
   3100 def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
   3101                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
   3102                             SDNPSideEffect]>;
   3103 
   3104 def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
        // SDT_NVPTXCall (above): 0 results, 1 operand, operand 0 constrained to i32.
        //
        // 'call' wraps NVPTXISD::CALL with SDNPHasChain, SDNPOptInGlue and
        // SDNPOutGlue.
   3105 def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
   3106                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
        // i32 operand class used for the destination of CALL.
   3107 def calltarget : Operand<i32>;
        // CALL is marked isCall and has an empty pattern list; it is selected by
        // the two Pats that follow.
   3108 let isCall=1 in {
   3109    def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
   3110 }
   3111 
        // Select CALL when the 'call' destination is a tglobaladdr or a
        // texternalsym.
   3112 def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
   3113 def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
   3114 
   3115 // Pseudo instructions.
        //
        // Pseudo forwards its four arguments (outs, ins, asmstr, pattern) unchanged
        // to NVPTXInst and sets no additional fields.
        // NOTE(review): the definitions immediately following this class derive
        // from NVPTXInst directly rather than from Pseudo -- confirm whether
        // Pseudo has users elsewhere.
   3116 class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
   3117    : NVPTXInst<outs, ins, asmstr, pattern>;
   3118 
   3119 def Callseq_Start :
          // Selected from callseq_start with two target-immediate operands.  The
          // asm string is an escaped '{' followed by a "// callseq $amt1, $amt2"
          // comment, then on a new line a ".reg .b32 temp_param_reg;" declaration.
   3120   NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
   3121             "\\{ // callseq $amt1, $amt2\n"
   3122             "\t.reg .b32 temp_param_reg;",
   3123             [(callseq_start timm:$amt1, timm:$amt2)]>;
        // Selected from callseq_end.  The asm string is the matching escaped '}'
        // followed by a "// callseq $amt1" comment; $amt2 is an operand but does
        // not appear in the asm string.
   3124 def Callseq_End :
   3125   NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
   3126             "\\} // callseq $amt1",
   3127             [(callseq_end timm:$amt1, timm:$amt2)]>;
   3128 
   3129 // trap instruction
        // Takes no operands, prints "trap;", and is selected from the 'trap' node.
   3130 def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
   3131 
   3132 // Call prototype wrapper
        //
        // SDTCallPrototype: 0 results, 1 operand, operand 0 constrained to an
        // integer type.
   3133 def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
        // CallPrototype wraps NVPTXISD::CallPrototype with SDNPHasChain,
        // SDNPOutGlue, SDNPInGlue and SDNPSideEffect.
   3134 def CallPrototype :
   3135   SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
   3136          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
        // i32 operand whose printing is delegated to the method named
        // "printProtoIdent".
   3137 def ProtoIdent : Operand<i32> {
   3138   let PrintMethod = "printProtoIdent";
   3139 }
        // The asm string is just "$ident", so the emitted text is whatever the
        // ProtoIdent print method produces.  Selected from CallPrototype applied to
        // an i32-typed texternalsym.
   3140 def CALL_PROTOTYPE :
   3141   NVPTXInst<(outs), (ins ProtoIdent:$ident),
   3142             "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
   3143 
   3144 
   3145 include "NVPTXIntrinsics.td"
   3146 
   3147 
   3148 //-----------------------------------
   3149 // Notes
   3150 //-----------------------------------
   3151 // BSWAP is currently expanded. The following would be more efficient:
   3152 // - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
   3153 //   registers
   3154 // - for sm_20, use prmt (use vector scalar mov to get the pack and unpack).
   3155 //   sm_20 supports native 32-bit registers, but not native 16-bit registers.
   3156