      1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief Custom DAG lowering for R600
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "R600ISelLowering.h"
     16 #include "AMDGPUFrameLowering.h"
     17 #include "AMDGPUIntrinsicInfo.h"
     18 #include "AMDGPUSubtarget.h"
     19 #include "R600Defines.h"
     20 #include "R600InstrInfo.h"
     21 #include "R600MachineFunctionInfo.h"
     22 #include "llvm/Analysis/ValueTracking.h"
     23 #include "llvm/CodeGen/CallingConvLower.h"
     24 #include "llvm/CodeGen/MachineFrameInfo.h"
     25 #include "llvm/CodeGen/MachineInstrBuilder.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/IR/Argument.h"
     29 #include "llvm/IR/Function.h"
     30 
     31 using namespace llvm;
     32 
     33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
     34                                        const AMDGPUSubtarget &STI)
     35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
     36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
     37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
     38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
     39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
     40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
     41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
     42 
     43   computeRegisterProperties(STI.getRegisterInfo());
     44 
     45   // Set condition code actions
     46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
     47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
     48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
     49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
     50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
     51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
     52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
     53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
     54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
     55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
     56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
     57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
     58 
     59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
     60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
     61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
     62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
     63 
     64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     66 
     67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
     68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
     69 
     70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     73 
     74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
     75 
     76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
     79 
     80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
     81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
     82 
     83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
     84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
     85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
     86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     88 
     89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
     90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
     91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
     92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
     93 
     94   // Expand sign extension of vectors
     95   if (!Subtarget->hasBFE())
     96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
     97 
     98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
     99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
    100 
    101   if (!Subtarget->hasBFE())
    102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
    103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
    104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
    105 
    106   if (!Subtarget->hasBFE())
    107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
    109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
    110 
    111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
    113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
    114 
    115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
    116 
    117 
    118   // Legalize loads and stores to the private address space.
    119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
    120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
    121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
    122 
    123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
    124   // spaces, so it is custom lowered to handle those where it isn't.
    125   for (MVT VT : MVT::integer_valuetypes()) {
    126     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    127     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    128     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
    129 
    130     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    131     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    132     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
    133 
    134     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    135     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    136     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
    137   }
    138 
    139   setOperationAction(ISD::STORE, MVT::i8, Custom);
    140   setOperationAction(ISD::STORE, MVT::i32, Custom);
    141   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
    142   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
    143   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
    144   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
    145 
    146   setOperationAction(ISD::LOAD, MVT::i32, Custom);
    147   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
    148   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
    149 
    150   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
    151   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
    152   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    153   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    154 
    155   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
    156   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
    157   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    158   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    159 
    160   setTargetDAGCombine(ISD::FP_ROUND);
    161   setTargetDAGCombine(ISD::FP_TO_SINT);
    162   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    163   setTargetDAGCombine(ISD::SELECT_CC);
    164   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    165 
    166   setOperationAction(ISD::SUB, MVT::i64, Expand);
    167 
    168   // These should be replaced by UDIVREM, but it does not happen automatically
    169   // during Type Legalization
    170   setOperationAction(ISD::UDIV, MVT::i64, Custom);
    171   setOperationAction(ISD::UREM, MVT::i64, Custom);
    172   setOperationAction(ISD::SDIV, MVT::i64, Custom);
    173   setOperationAction(ISD::SREM, MVT::i64, Custom);
    174 
    175   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
    176   //  to be Legal/Custom in order to avoid library calls.
    177   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    178   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
    179   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    180 
    181   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
    182 
    183   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
    184   for (MVT VT : ScalarIntVTs) {
    185     setOperationAction(ISD::ADDC, VT, Expand);
    186     setOperationAction(ISD::SUBC, VT, Expand);
    187     setOperationAction(ISD::ADDE, VT, Expand);
    188     setOperationAction(ISD::SUBE, VT, Expand);
    189   }
    190 
    191   setSchedulingPreference(Sched::Source);
    192 }
    193 
    194 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    195     MachineInstr * MI, MachineBasicBlock * BB) const {
    196   MachineFunction * MF = BB->getParent();
    197   MachineRegisterInfo &MRI = MF->getRegInfo();
    198   MachineBasicBlock::iterator I = *MI;
    199   const R600InstrInfo *TII =
    200       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
    201 
    202   switch (MI->getOpcode()) {
    203   default:
    204     // Replace LDS_*_RET instructions that don't have any uses with the
    205     // equivalent LDS_*_NORET instruction.
    206     if (TII->isLDSRetInstr(MI->getOpcode())) {
    207       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
    208       assert(DstIdx != -1);
    209       MachineInstrBuilder NewMI;
    210       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
    211       //        LDS_1A2D support and remove this special case.
    212       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
    213            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
    214         return BB;
    215 
    216       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
    217                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
    218       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
    219         NewMI.addOperand(MI->getOperand(i));
    220       }
    221     } else {
    222       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    223     }
    224     break;
    225   case AMDGPU::CLAMP_R600: {
    226     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
    227                                                    AMDGPU::MOV,
    228                                                    MI->getOperand(0).getReg(),
    229                                                    MI->getOperand(1).getReg());
    230     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    231     break;
    232   }
    233 
    234   case AMDGPU::FABS_R600: {
    235     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
    236                                                     AMDGPU::MOV,
    237                                                     MI->getOperand(0).getReg(),
    238                                                     MI->getOperand(1).getReg());
    239     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    240     break;
    241   }
    242 
    243   case AMDGPU::FNEG_R600: {
    244     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
    245                                                     AMDGPU::MOV,
    246                                                     MI->getOperand(0).getReg(),
    247                                                     MI->getOperand(1).getReg());
    248     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    249     break;
    250   }
    251 
    252   case AMDGPU::MASK_WRITE: {
    253     unsigned maskedRegister = MI->getOperand(0).getReg();
    254     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    255     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    256     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    257     break;
    258   }
    259 
    260   case AMDGPU::MOV_IMM_F32:
    261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
    262                      MI->getOperand(1).getFPImm()->getValueAPF()
    263                          .bitcastToAPInt().getZExtValue());
    264     break;
    265   case AMDGPU::MOV_IMM_I32:
    266     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
    267                      MI->getOperand(1).getImm());
    268     break;
    269   case AMDGPU::CONST_COPY: {
    270     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
    271         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    272     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
    273         MI->getOperand(1).getImm());
    274     break;
    275   }
    276 
    277   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
    278   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
    279   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
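            // Set the End-Of-Program bit when the write is immediately followed by RETURN.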
    280     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    281 
    282     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
    283             .addOperand(MI->getOperand(0))
    284             .addOperand(MI->getOperand(1))
    285             .addImm(EOP); // Set End of program bit
    286     break;
    287   }
    288 
    289   case AMDGPU::TXD: {
    290     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    291     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    292     MachineOperand &RID = MI->getOperand(4);
    293     MachineOperand &SID = MI->getOperand(5);
    294     unsigned TextureId = MI->getOperand(6).getImm();
    295     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    296     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
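            // SrcX..SrcW select the coordinate swizzle; CTX..CTW are the per-component
            // coordinate-type flags (1 = normalized), adjusted per texture target below.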
    297 
    298     switch (TextureId) {
    299     case 5: // Rect
    300       CTX = CTY = 0;
    301       break;
    302     case 6: // Shadow1D
    303       SrcW = SrcZ;
    304       break;
    305     case 7: // Shadow2D
    306       SrcW = SrcZ;
    307       break;
    308     case 8: // ShadowRect
    309       CTX = CTY = 0;
    310       SrcW = SrcZ;
    311       break;
    312     case 9: // 1DArray
    313       SrcZ = SrcY;
    314       CTZ = 0;
    315       break;
    316     case 10: // 2DArray
    317       CTZ = 0;
    318       break;
    319     case 11: // Shadow1DArray
    320       SrcZ = SrcY;
    321       CTZ = 0;
    322       break;
    323     case 12: // Shadow2DArray
    324       CTZ = 0;
    325       break;
    326     }
    327     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
    328             .addOperand(MI->getOperand(3))
    329             .addImm(SrcX)
    330             .addImm(SrcY)
    331             .addImm(SrcZ)
    332             .addImm(SrcW)
    333             .addImm(0)
    334             .addImm(0)
    335             .addImm(0)
    336             .addImm(0)
    337             .addImm(1)
    338             .addImm(2)
    339             .addImm(3)
    340             .addOperand(RID)
    341             .addOperand(SID)
    342             .addImm(CTX)
    343             .addImm(CTY)
    344             .addImm(CTZ)
    345             .addImm(CTW);
    346     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
    347             .addOperand(MI->getOperand(2))
    348             .addImm(SrcX)
    349             .addImm(SrcY)
    350             .addImm(SrcZ)
    351             .addImm(SrcW)
    352             .addImm(0)
    353             .addImm(0)
    354             .addImm(0)
    355             .addImm(0)
    356             .addImm(1)
    357             .addImm(2)
    358             .addImm(3)
    359             .addOperand(RID)
    360             .addOperand(SID)
    361             .addImm(CTX)
    362             .addImm(CTY)
    363             .addImm(CTZ)
    364             .addImm(CTW);
    365     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
    366             .addOperand(MI->getOperand(0))
    367             .addOperand(MI->getOperand(1))
    368             .addImm(SrcX)
    369             .addImm(SrcY)
    370             .addImm(SrcZ)
    371             .addImm(SrcW)
    372             .addImm(0)
    373             .addImm(0)
    374             .addImm(0)
    375             .addImm(0)
    376             .addImm(1)
    377             .addImm(2)
    378             .addImm(3)
    379             .addOperand(RID)
    380             .addOperand(SID)
    381             .addImm(CTX)
    382             .addImm(CTY)
    383             .addImm(CTZ)
    384             .addImm(CTW)
    385             .addReg(T0, RegState::Implicit)
    386             .addReg(T1, RegState::Implicit);
    387     break;
    388   }
    389 
    390   case AMDGPU::TXD_SHADOW: {
    391     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    392     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    393     MachineOperand &RID = MI->getOperand(4);
    394     MachineOperand &SID = MI->getOperand(5);
    395     unsigned TextureId = MI->getOperand(6).getImm();
    396     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    397     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
    398 
    399     switch (TextureId) {
    400     case 5: // Rect
    401       CTX = CTY = 0;
    402       break;
    403     case 6: // Shadow1D
    404       SrcW = SrcZ;
    405       break;
    406     case 7: // Shadow2D
    407       SrcW = SrcZ;
    408       break;
    409     case 8: // ShadowRect
    410       CTX = CTY = 0;
    411       SrcW = SrcZ;
    412       break;
    413     case 9: // 1DArray
    414       SrcZ = SrcY;
    415       CTZ = 0;
    416       break;
    417     case 10: // 2DArray
    418       CTZ = 0;
    419       break;
    420     case 11: // Shadow1DArray
    421       SrcZ = SrcY;
    422       CTZ = 0;
    423       break;
    424     case 12: // Shadow2DArray
    425       CTZ = 0;
    426       break;
    427     }
    428 
    429     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
    430             .addOperand(MI->getOperand(3))
    431             .addImm(SrcX)
    432             .addImm(SrcY)
    433             .addImm(SrcZ)
    434             .addImm(SrcW)
    435             .addImm(0)
    436             .addImm(0)
    437             .addImm(0)
    438             .addImm(0)
    439             .addImm(1)
    440             .addImm(2)
    441             .addImm(3)
    442             .addOperand(RID)
    443             .addOperand(SID)
    444             .addImm(CTX)
    445             .addImm(CTY)
    446             .addImm(CTZ)
    447             .addImm(CTW);
    448     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
    449             .addOperand(MI->getOperand(2))
    450             .addImm(SrcX)
    451             .addImm(SrcY)
    452             .addImm(SrcZ)
    453             .addImm(SrcW)
    454             .addImm(0)
    455             .addImm(0)
    456             .addImm(0)
    457             .addImm(0)
    458             .addImm(1)
    459             .addImm(2)
    460             .addImm(3)
    461             .addOperand(RID)
    462             .addOperand(SID)
    463             .addImm(CTX)
    464             .addImm(CTY)
    465             .addImm(CTZ)
    466             .addImm(CTW);
    467     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
    468             .addOperand(MI->getOperand(0))
    469             .addOperand(MI->getOperand(1))
    470             .addImm(SrcX)
    471             .addImm(SrcY)
    472             .addImm(SrcZ)
    473             .addImm(SrcW)
    474             .addImm(0)
    475             .addImm(0)
    476             .addImm(0)
    477             .addImm(0)
    478             .addImm(1)
    479             .addImm(2)
    480             .addImm(3)
    481             .addOperand(RID)
    482             .addOperand(SID)
    483             .addImm(CTX)
    484             .addImm(CTY)
    485             .addImm(CTZ)
    486             .addImm(CTW)
    487             .addReg(T0, RegState::Implicit)
    488             .addReg(T1, RegState::Implicit);
    489     break;
    490   }
    491 
    492   case AMDGPU::BRANCH:
    493       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
    494               .addOperand(MI->getOperand(0));
    495       break;
    496 
    497   case AMDGPU::BRANCH_COND_f32: {
    498     MachineInstr *NewMI =
    499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
    500               AMDGPU::PREDICATE_BIT)
    501               .addOperand(MI->getOperand(1))
    502               .addImm(OPCODE_IS_NOT_ZERO)
    503               .addImm(0); // Flags
    504     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    505     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
    506             .addOperand(MI->getOperand(0))
    507             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    508     break;
    509   }
    510 
    511   case AMDGPU::BRANCH_COND_i32: {
    512     MachineInstr *NewMI =
    513       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
    514             AMDGPU::PREDICATE_BIT)
    515             .addOperand(MI->getOperand(1))
    516             .addImm(OPCODE_IS_NOT_ZERO_INT)
    517             .addImm(0); // Flags
    518     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    519     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
    520            .addOperand(MI->getOperand(0))
    521             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    522     break;
    523   }
    524 
    525   case AMDGPU::EG_ExportSwz:
    526   case AMDGPU::R600_ExportSwz: {
    527     // Instruction is left unmodified if it's not the last one of its type.
    528     bool isLastInstructionOfItsType = true;
    529     unsigned InstExportType = MI->getOperand(1).getImm();
    530     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
    531          EndBlock = BB->end(); NextExportInst != EndBlock;
    532          NextExportInst = std::next(NextExportInst)) {
    533       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
    534           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
    535         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
    536             .getImm();
    537         if (CurrentInstExportType == InstExportType) {
    538           isLastInstructionOfItsType = false;
    539           break;
    540         }
    541       }
    542     }
    543     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    544     if (!EOP && !isLastInstructionOfItsType)
    545       return BB;
    546     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
    547     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
    548             .addOperand(MI->getOperand(0))
    549             .addOperand(MI->getOperand(1))
    550             .addOperand(MI->getOperand(2))
    551             .addOperand(MI->getOperand(3))
    552             .addOperand(MI->getOperand(4))
    553             .addOperand(MI->getOperand(5))
    554             .addOperand(MI->getOperand(6))
    555             .addImm(CfInst)
    556             .addImm(EOP);
    557     break;
    558   }
    559   case AMDGPU::RETURN: {
    560     // RETURN instructions must have the live-out registers as implicit uses,
    561     // otherwise they appear dead.
    562     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    563     MachineInstrBuilder MIB(*MF, MI);
    564     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
    565       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    566     return BB;
    567   }
    568   }
    569 
    570   MI->eraseFromParent();
    571   return BB;
    572 }
    573 
    574 //===----------------------------------------------------------------------===//
    575 // Custom DAG Lowering Operations
    576 //===----------------------------------------------------------------------===//
    577 
    578 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    579   MachineFunction &MF = DAG.getMachineFunction();
    580   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    581   switch (Op.getOpcode()) {
    582   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    583   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
    584   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
    585   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
    586   case ISD::SRA_PARTS:
    587   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
    588   case ISD::FCOS:
    589   case ISD::FSIN: return LowerTrig(Op, DAG);
    590   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
    591   case ISD::STORE: return LowerSTORE(Op, DAG);
    592   case ISD::LOAD: {
    593     SDValue Result = LowerLOAD(Op, DAG);
    594     assert((!Result.getNode() ||
    595             Result.getNode()->getNumValues() == 2) &&
    596            "Load should return a value and a chain");
    597     return Result;
    598   }
    599 
    600   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    601   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
    602   case ISD::INTRINSIC_VOID: {
    603     SDValue Chain = Op.getOperand(0);
    604     unsigned IntrinsicID =
    605                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    606     switch (IntrinsicID) {
    607     case AMDGPUIntrinsic::AMDGPU_store_output: {
    608       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
    609       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
    610       MFI->LiveOuts.push_back(Reg);
    611       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    612     }
    613     case AMDGPUIntrinsic::R600_store_swizzle: {
    614       const SDValue Args[8] = {
    615         Chain,
    616         Op.getOperand(2), // Export Value
    617         Op.getOperand(3), // ArrayBase
    618         Op.getOperand(4), // Type
    619         DAG.getConstant(0, MVT::i32), // SWZ_X
    620         DAG.getConstant(1, MVT::i32), // SWZ_Y
    621         DAG.getConstant(2, MVT::i32), // SWZ_Z
    622         DAG.getConstant(3, MVT::i32) // SWZ_W
    623       };
    624       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
    625     }
    626 
    627     // default for switch(IntrinsicID)
    628     default: break;
    629     }
    630     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    631     break;
    632   }
    633   case ISD::INTRINSIC_WO_CHAIN: {
    634     unsigned IntrinsicID =
    635                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    636     EVT VT = Op.getValueType();
    637     SDLoc DL(Op);
    638     switch(IntrinsicID) {
    639     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    640     case AMDGPUIntrinsic::R600_load_input: {
    641       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    642       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
    643       MachineFunction &MF = DAG.getMachineFunction();
    644       MachineRegisterInfo &MRI = MF.getRegInfo();
    645       MRI.addLiveIn(Reg);
    646       return DAG.getCopyFromReg(DAG.getEntryNode(),
    647           SDLoc(DAG.getEntryNode()), Reg, VT);
    648     }
    649 
    650     case AMDGPUIntrinsic::R600_interp_input: {
    651       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    652       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
    653       MachineSDNode *interp;
    654       if (ijb < 0) {
    655         const R600InstrInfo *TII =
    656             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
    657         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
    658             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
    659         return DAG.getTargetExtractSubreg(
    660             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
    661             DL, MVT::f32, SDValue(interp, 0));
    662       }
    663       MachineFunction &MF = DAG.getMachineFunction();
    664       MachineRegisterInfo &MRI = MF.getRegInfo();
    665       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
    666       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
    667       MRI.addLiveIn(RegisterI);
    668       MRI.addLiveIn(RegisterJ);
    669       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
    670           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
    671       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
    672           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
    673 
    674       if (slot % 4 < 2)
    675         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
    676             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
    677             RegisterJNode, RegisterINode);
    678       else
    679         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
    680             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
    681             RegisterJNode, RegisterINode);
    682       return SDValue(interp, slot % 2);
    683     }
    684     case AMDGPUIntrinsic::R600_interp_xy:
    685     case AMDGPUIntrinsic::R600_interp_zw: {
    686       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    687       MachineSDNode *interp;
    688       SDValue RegisterINode = Op.getOperand(2);
    689       SDValue RegisterJNode = Op.getOperand(3);
    690 
    691       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
    692         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
    693             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
    694             RegisterJNode, RegisterINode);
    695       else
    696         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
    697             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
    698             RegisterJNode, RegisterINode);
    699       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
    700           SDValue(interp, 0), SDValue(interp, 1));
    701     }
    702     case AMDGPUIntrinsic::R600_tex:
    703     case AMDGPUIntrinsic::R600_texc:
    704     case AMDGPUIntrinsic::R600_txl:
    705     case AMDGPUIntrinsic::R600_txlc:
    706     case AMDGPUIntrinsic::R600_txb:
    707     case AMDGPUIntrinsic::R600_txbc:
    708     case AMDGPUIntrinsic::R600_txf:
    709     case AMDGPUIntrinsic::R600_txq:
    710     case AMDGPUIntrinsic::R600_ddx:
    711     case AMDGPUIntrinsic::R600_ddy:
    712     case AMDGPUIntrinsic::R600_ldptr: {
    713       unsigned TextureOp;
    714       switch (IntrinsicID) {
    715       case AMDGPUIntrinsic::R600_tex:
    716         TextureOp = 0;
    717         break;
    718       case AMDGPUIntrinsic::R600_texc:
    719         TextureOp = 1;
    720         break;
    721       case AMDGPUIntrinsic::R600_txl:
    722         TextureOp = 2;
    723         break;
    724       case AMDGPUIntrinsic::R600_txlc:
    725         TextureOp = 3;
    726         break;
    727       case AMDGPUIntrinsic::R600_txb:
    728         TextureOp = 4;
    729         break;
    730       case AMDGPUIntrinsic::R600_txbc:
    731         TextureOp = 5;
    732         break;
    733       case AMDGPUIntrinsic::R600_txf:
    734         TextureOp = 6;
    735         break;
    736       case AMDGPUIntrinsic::R600_txq:
    737         TextureOp = 7;
    738         break;
    739       case AMDGPUIntrinsic::R600_ddx:
    740         TextureOp = 8;
    741         break;
    742       case AMDGPUIntrinsic::R600_ddy:
    743         TextureOp = 9;
    744         break;
    745       case AMDGPUIntrinsic::R600_ldptr:
    746         TextureOp = 10;
    747         break;
    748       default:
    749         llvm_unreachable("Unknown Texture Operation");
    750       }
    751 
    752       SDValue TexArgs[19] = {
    753         DAG.getConstant(TextureOp, MVT::i32),
    754         Op.getOperand(1),
    755         DAG.getConstant(0, MVT::i32),
    756         DAG.getConstant(1, MVT::i32),
    757         DAG.getConstant(2, MVT::i32),
    758         DAG.getConstant(3, MVT::i32),
    759         Op.getOperand(2),
    760         Op.getOperand(3),
    761         Op.getOperand(4),
    762         DAG.getConstant(0, MVT::i32),
    763         DAG.getConstant(1, MVT::i32),
    764         DAG.getConstant(2, MVT::i32),
    765         DAG.getConstant(3, MVT::i32),
    766         Op.getOperand(5),
    767         Op.getOperand(6),
    768         Op.getOperand(7),
    769         Op.getOperand(8),
    770         Op.getOperand(9),
    771         Op.getOperand(10)
    772       };
    773       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    774     }
    775     case AMDGPUIntrinsic::AMDGPU_dp4: {
    776       SDValue Args[8] = {
    777       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    778           DAG.getConstant(0, MVT::i32)),
    779       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    780           DAG.getConstant(0, MVT::i32)),
    781       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    782           DAG.getConstant(1, MVT::i32)),
    783       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    784           DAG.getConstant(1, MVT::i32)),
    785       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    786           DAG.getConstant(2, MVT::i32)),
    787       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    788           DAG.getConstant(2, MVT::i32)),
    789       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    790           DAG.getConstant(3, MVT::i32)),
    791       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    792           DAG.getConstant(3, MVT::i32))
    793       };
    794       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    795     }
    796 
    797     case Intrinsic::r600_read_ngroups_x:
    798       return LowerImplicitParameter(DAG, VT, DL, 0);
    799     case Intrinsic::r600_read_ngroups_y:
    800       return LowerImplicitParameter(DAG, VT, DL, 1);
    801     case Intrinsic::r600_read_ngroups_z:
    802       return LowerImplicitParameter(DAG, VT, DL, 2);
    803     case Intrinsic::r600_read_global_size_x:
    804       return LowerImplicitParameter(DAG, VT, DL, 3);
    805     case Intrinsic::r600_read_global_size_y:
    806       return LowerImplicitParameter(DAG, VT, DL, 4);
    807     case Intrinsic::r600_read_global_size_z:
    808       return LowerImplicitParameter(DAG, VT, DL, 5);
    809     case Intrinsic::r600_read_local_size_x:
    810       return LowerImplicitParameter(DAG, VT, DL, 6);
    811     case Intrinsic::r600_read_local_size_y:
    812       return LowerImplicitParameter(DAG, VT, DL, 7);
    813     case Intrinsic::r600_read_local_size_z:
    814       return LowerImplicitParameter(DAG, VT, DL, 8);
    815 
    816     case Intrinsic::AMDGPU_read_workdim:
    817       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
    818 
    819     case Intrinsic::r600_read_tgid_x:
    820       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    821                                   AMDGPU::T1_X, VT);
    822     case Intrinsic::r600_read_tgid_y:
    823       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    824                                   AMDGPU::T1_Y, VT);
    825     case Intrinsic::r600_read_tgid_z:
    826       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    827                                   AMDGPU::T1_Z, VT);
    828     case Intrinsic::r600_read_tidig_x:
    829       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    830                                   AMDGPU::T0_X, VT);
    831     case Intrinsic::r600_read_tidig_y:
    832       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    833                                   AMDGPU::T0_Y, VT);
    834     case Intrinsic::r600_read_tidig_z:
    835       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    836                                   AMDGPU::T0_Z, VT);
    837     case Intrinsic::AMDGPU_rsq:
    838       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
    839       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    840 
    841     case AMDGPUIntrinsic::AMDGPU_fract:
    842     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
    843       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    844     }
    845     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    846     break;
    847   }
    848   } // end switch(Op.getOpcode())
    849   return SDValue();
    850 }
    851 
    852 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
    853                                             SmallVectorImpl<SDValue> &Results,
    854                                             SelectionDAG &DAG) const {
    855   switch (N->getOpcode()) {
    856   default:
    857     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    858     return;
    859   case ISD::FP_TO_UINT:
    860     if (N->getValueType(0) == MVT::i1) {
    861       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    862       return;
    863     }
    864     // Fall-through. Since we don't care about out of bounds values
    865     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    866     // considers some extra cases which are not necessary here.
    867   case ISD::FP_TO_SINT: {
    868     SDValue Result;
    869     if (expandFP_TO_SINT(N, Result, DAG))
    870       Results.push_back(Result);
    871     return;
    872   }
    873   case ISD::UDIV: {
    874     SDValue Op = SDValue(N, 0);
    875     SDLoc DL(Op);
    876     EVT VT = Op.getValueType();
    877     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
    878       N->getOperand(0), N->getOperand(1));
    879     Results.push_back(UDIVREM);
    880     break;
    881   }
    882   case ISD::UREM: {
    883     SDValue Op = SDValue(N, 0);
    884     SDLoc DL(Op);
    885     EVT VT = Op.getValueType();
    886     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
    887       N->getOperand(0), N->getOperand(1));
    888     Results.push_back(UDIVREM.getValue(1));
    889     break;
    890   }
    891   case ISD::SDIV: {
    892     SDValue Op = SDValue(N, 0);
    893     SDLoc DL(Op);
    894     EVT VT = Op.getValueType();
    895     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
    896       N->getOperand(0), N->getOperand(1));
    897     Results.push_back(SDIVREM);
    898     break;
    899   }
    900   case ISD::SREM: {
    901     SDValue Op = SDValue(N, 0);
    902     SDLoc DL(Op);
    903     EVT VT = Op.getValueType();
    904     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
    905       N->getOperand(0), N->getOperand(1));
    906     Results.push_back(SDIVREM.getValue(1));
    907     break;
    908   }
    909   case ISD::SDIVREM: {
    910     SDValue Op = SDValue(N, 1);
    911     SDValue RES = LowerSDIVREM(Op, DAG);
    912     Results.push_back(RES);
    913     Results.push_back(RES.getValue(1));
    914     break;
    915   }
    916   case ISD::UDIVREM: {
    917     SDValue Op = SDValue(N, 0);
    918     LowerUDIVREM64(Op, DAG, Results);
    919     break;
    920   }
    921   }
    922 }
    923 
    924 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
    925                                                    SDValue Vector) const {
    926 
    927   SDLoc DL(Vector);
    928   EVT VecVT = Vector.getValueType();
    929   EVT EltVT = VecVT.getVectorElementType();
    930   SmallVector<SDValue, 8> Args;
    931 
    932   for (unsigned i = 0, e = VecVT.getVectorNumElements();
    933                                                            i != e; ++i) {
    934     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
    935                                Vector, DAG.getConstant(i, getVectorIdxTy())));
    936   }
    937 
    938   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
    939 }
    940 
    941 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
    942                                                     SelectionDAG &DAG) const {
    943 
    944   SDLoc DL(Op);
    945   SDValue Vector = Op.getOperand(0);
    946   SDValue Index = Op.getOperand(1);
    947 
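          // A constant index, or a vector already in the vertical per-register form,
          // can be matched directly; otherwise rebuild the vector vertically so a
          // dynamic index can be handled.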
    948   if (isa<ConstantSDNode>(Index) ||
    949       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    950     return Op;
    951 
    952   Vector = vectorToVerticalVector(DAG, Vector);
    953   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
    954                      Vector, Index);
    955 }
    956 
    957 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
    958                                                    SelectionDAG &DAG) const {
    959   SDLoc DL(Op);
    960   SDValue Vector = Op.getOperand(0);
    961   SDValue Value = Op.getOperand(1);
    962   SDValue Index = Op.getOperand(2);
    963 
    964   if (isa<ConstantSDNode>(Index) ||
    965       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    966     return Op;
    967 
    968   Vector = vectorToVerticalVector(DAG, Vector);
    969   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
    970                                Vector, Value, Index);
    971   return vectorToVerticalVector(DAG, Insert);
    972 }
    973 
    974 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
    975   // On hw >= R700, COS/SIN input must be between -1. and 1.
    976   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
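          // 0.15915494309 below is 1/(2*Pi): the argument is scaled to turns before
          // the range reduction.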
    977   EVT VT = Op.getValueType();
    978   SDValue Arg = Op.getOperand(0);
    979   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
    980       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
    981         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
    982           DAG.getConstantFP(0.15915494309, MVT::f32)),
    983         DAG.getConstantFP(0.5, MVT::f32)));
    984   unsigned TrigNode;
    985   switch (Op.getOpcode()) {
    986   case ISD::FCOS:
    987     TrigNode = AMDGPUISD::COS_HW;
    988     break;
    989   case ISD::FSIN:
    990     TrigNode = AMDGPUISD::SIN_HW;
    991     break;
    992   default:
    993     llvm_unreachable("Wrong trig opcode");
    994   }
    995   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
    996       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
    997         DAG.getConstantFP(-0.5, MVT::f32)));
    998   if (Gen >= AMDGPUSubtarget::R700)
    999     return TrigVal;
   1000   // On R600 hw, COS/SIN input must be between -Pi and Pi.
   1001   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
   1002       DAG.getConstantFP(3.14159265359, MVT::f32));
   1003 }
   1004 
   1005 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
   1006   SDLoc DL(Op);
   1007   EVT VT = Op.getValueType();
   1008 
   1009   SDValue Lo = Op.getOperand(0);
   1010   SDValue Hi = Op.getOperand(1);
   1011   SDValue Shift = Op.getOperand(2);
   1012   SDValue Zero = DAG.getConstant(0, VT);
   1013   SDValue One  = DAG.getConstant(1, VT);
   1014 
   1015   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
   1016   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
   1017   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
   1018   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
   1019 
   1020   // The dance around Width1 is necessary for the Shift == 0 special case.
   1021   // Without it the CompShift might be 32, producing incorrect results in
   1022   // Overflow. So we do the shift in two steps; the alternative is to
   1023   // add a conditional to filter out the special case.
   1024 
   1025   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
   1026   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
   1027 
   1028   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
   1029   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
   1030   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
   1031 
   1032   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
   1033   SDValue LoBig = Zero;
   1034 
   1035   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
   1036   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
   1037 
   1038   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
   1039 }
   1040 
   1041 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
   1042   SDLoc DL(Op);
   1043   EVT VT = Op.getValueType();
   1044 
   1045   SDValue Lo = Op.getOperand(0);
   1046   SDValue Hi = Op.getOperand(1);
   1047   SDValue Shift = Op.getOperand(2);
   1048   SDValue Zero = DAG.getConstant(0, VT);
   1049   SDValue One  = DAG.getConstant(1, VT);
   1050 
   1051   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
   1052 
   1053   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
   1054   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
   1055   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
   1056   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
   1057 
   1058   // The dance around Width1 is necessary for the Shift == 0 special case.
   1059   // Without it the CompShift might be 32, producing incorrect results in
   1060   // Overflow. So we do the shift in two steps; the alternative is to
   1061   // add a conditional to filter out the special case.
   1062 
   1063   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
   1064   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
   1065 
   1066   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
   1067   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
   1068   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
   1069 
   1070   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
   1071   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
   1072 
   1073   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
   1074   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
   1075 
   1076   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
   1077 }
   1078 
   1079 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
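          // For an i1 result, fp-to-uint reduces to testing the input against 0.0.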
   1080   return DAG.getNode(
   1081       ISD::SETCC,
   1082       SDLoc(Op),
   1083       MVT::i1,
   1084       Op, DAG.getConstantFP(0.0f, MVT::f32),
   1085       DAG.getCondCode(ISD::SETNE)
   1086       );
   1087 }
   1088 
   1089 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
   1090                                                    SDLoc DL,
   1091                                                    unsigned DwordOffset) const {
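          // Implicit kernel parameters such as ngroups and the global/local sizes are
          // stored at fixed dword offsets in constant buffer 0.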
   1092   unsigned ByteOffset = DwordOffset * 4;
   1093   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
   1094                                       AMDGPUAS::CONSTANT_BUFFER_0);
   1095 
   1096   // We shouldn't be using an offset wider than 16 bits for implicit parameters.
   1097   assert(isInt<16>(ByteOffset));
   1098 
   1099   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
   1100                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
   1101                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
   1102                      false, false, false, 0);
   1103 }
   1104 
   1105 bool R600TargetLowering::isZero(SDValue Op) const {
   1106   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
   1107     return Cst->isNullValue();
   1108   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
   1109     return CstFP->isZero();
   1110   } else {
   1111     return false;
   1112   }
   1113 }
   1114 
   1115 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   1116   SDLoc DL(Op);
   1117   EVT VT = Op.getValueType();
   1118 
   1119   SDValue LHS = Op.getOperand(0);
   1120   SDValue RHS = Op.getOperand(1);
   1121   SDValue True = Op.getOperand(2);
   1122   SDValue False = Op.getOperand(3);
   1123   SDValue CC = Op.getOperand(4);
   1124   SDValue Temp;
   1125 
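          // See if this f32 select_cc can be folded into a legacy min/max node first.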
   1126   if (VT == MVT::f32) {
   1127     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
   1128     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
   1129     if (MinMax)
   1130       return MinMax;
   1131   }
   1132 
   1133   // LHS and RHS are guaranteed to be the same value type
   1134   EVT CompareVT = LHS.getValueType();
   1135 
   1136   // Check if we can lower this to a native operation.
   1137 
   1138   // Try to lower to a SET* instruction:
   1139   //
   1140   // SET* can match the following patterns:
   1141   //
   1142   // select_cc f32, f32, -1,  0, cc_supported
   1143   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
   1144   // select_cc i32, i32, -1,  0, cc_supported
   1145   //
   1146 
   1147   // Move hardware True/False values to the correct operand.
   1148   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1149   ISD::CondCode InverseCC =
   1150      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1151   if (isHWTrueValue(False) && isHWFalseValue(True)) {
   1152     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
   1153       std::swap(False, True);
   1154       CC = DAG.getCondCode(InverseCC);
   1155     } else {
   1156       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
   1157       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
   1158         std::swap(False, True);
   1159         std::swap(LHS, RHS);
   1160         CC = DAG.getCondCode(SwapInvCC);
   1161       }
   1162     }
   1163   }
   1164 
   1165   if (isHWTrueValue(True) && isHWFalseValue(False) &&
   1166       (CompareVT == VT || VT == MVT::i32)) {
   1167     // This can be matched by a SET* instruction.
   1168     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
   1169   }
   1170 
   1171   // Try to lower to a CND* instruction:
   1172   //
   1173   // CND* can match the following patterns:
   1174   //
   1175   // select_cc f32, 0.0, f32, f32, cc_supported
   1176   // select_cc f32, 0.0, i32, i32, cc_supported
   1177   // select_cc i32, 0,   f32, f32, cc_supported
   1178   // select_cc i32, 0,   i32, i32, cc_supported
   1179   //
   1180 
   1181   // Try to move the zero value to the RHS
   1182   if (isZero(LHS)) {
   1183     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1184     // Try swapping the operands
   1185     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
   1186     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1187       std::swap(LHS, RHS);
   1188       CC = DAG.getCondCode(CCSwapped);
   1189     } else {
   1190       // Try inverting the condition and then swapping the operands
   1191       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
   1192       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
   1193       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1194         std::swap(True, False);
   1195         std::swap(LHS, RHS);
   1196         CC = DAG.getCondCode(CCSwapped);
   1197       }
   1198     }
   1199   }
   1200   if (isZero(RHS)) {
   1201     SDValue Cond = LHS;
   1202     SDValue Zero = RHS;
   1203     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1204     if (CompareVT != VT) {
   1205       // Bitcast True / False to the correct types.  This will end up being
   1206       // a nop, but it allows us to define only a single pattern in the
   1207       // .TD files for each CND* instruction rather than having to have
   1208       // one pattern for integer True/False and one for fp True/False
   1209       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
   1210       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
   1211     }
   1212 
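            // There is no CND* variant for 'not equal', so invert NE-family conditions
            // and swap the select operands instead.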
   1213     switch (CCOpcode) {
   1214     case ISD::SETONE:
   1215     case ISD::SETUNE:
   1216     case ISD::SETNE:
   1217       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1218       Temp = True;
   1219       True = False;
   1220       False = Temp;
   1221       break;
   1222     default:
   1223       break;
   1224     }
   1225     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
   1226         Cond, Zero,
   1227         True, False,
   1228         DAG.getCondCode(CCOpcode));
   1229     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
   1230   }
   1231 
   1232   // If we make it this far, it means we have no native instructions to handle
   1233   // this SELECT_CC, so we must lower it.
   1234   SDValue HWTrue, HWFalse;
   1235 
   1236   if (CompareVT == MVT::f32) {
   1237     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
   1238     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
   1239   } else if (CompareVT == MVT::i32) {
   1240     HWTrue = DAG.getConstant(-1, CompareVT);
   1241     HWFalse = DAG.getConstant(0, CompareVT);
   1242   }
   1243   else {
   1244     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
   1245   }
   1246 
   1247   // Lower this unsupported SELECT_CC into a combination of two supported
   1248   // SELECT_CC operations.
   1249   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
   1250 
   1251   return DAG.getNode(ISD::SELECT_CC, DL, VT,
   1252       Cond, HWFalse,
   1253       True, False,
   1254       DAG.getCondCode(ISD::SETNE));
   1255 }
   1256 
   1257 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
   1258 /// convert these pointers to a register index.  Each register holds
   1259 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
   1260 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
   1261 /// for indirect addressing.
   1262 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
   1263                                                unsigned StackWidth,
   1264                                                SelectionDAG &DAG) const {
   1265   unsigned SRLPad;
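          // With a stack width of 1, 2 or 4 channels, each slot occupies 4, 8 or 16
          // bytes, so shift the byte address right by 2, 3 or 4 accordingly.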
   1266   switch(StackWidth) {
   1267   case 1:
   1268     SRLPad = 2;
   1269     break;
   1270   case 2:
   1271     SRLPad = 3;
   1272     break;
   1273   case 4:
   1274     SRLPad = 4;
   1275     break;
   1276   default: llvm_unreachable("Invalid stack width");
   1277   }
   1278 
   1279   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
   1280                      DAG.getConstant(SRLPad, MVT::i32));
   1281 }
   1282 
   1283 void R600TargetLowering::getStackAddress(unsigned StackWidth,
   1284                                          unsigned ElemIdx,
   1285                                          unsigned &Channel,
   1286                                          unsigned &PtrIncr) const {
   1287   switch (StackWidth) {
   1288   default:
   1289   case 1:
   1290     Channel = 0;
   1291     if (ElemIdx > 0) {
   1292       PtrIncr = 1;
   1293     } else {
   1294       PtrIncr = 0;
   1295     }
   1296     break;
   1297   case 2:
   1298     Channel = ElemIdx % 2;
   1299     if (ElemIdx == 2) {
   1300       PtrIncr = 1;
   1301     } else {
   1302       PtrIncr = 0;
   1303     }
   1304     break;
   1305   case 4:
   1306     Channel = ElemIdx;
   1307     PtrIncr = 0;
   1308     break;
   1309   }
   1310 }
   1311 
   1312 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   1313   SDLoc DL(Op);
   1314   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
   1315   SDValue Chain = Op.getOperand(0);
   1316   SDValue Value = Op.getOperand(1);
   1317   SDValue Ptr = Op.getOperand(2);
   1318 
   1319   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   1320   if (Result.getNode()) {
   1321     return Result;
   1322   }
   1323 
   1324   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
   1325     if (StoreNode->isTruncatingStore()) {
   1326       EVT VT = Value.getValueType();
   1327       assert(VT.bitsLE(MVT::i32));
   1328       EVT MemVT = StoreNode->getMemoryVT();
   1329       SDValue MaskConstant;
   1330       if (MemVT == MVT::i8) {
   1331         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
   1332       } else {
   1333         assert(MemVT == MVT::i16);
   1334         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
   1335       }
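              // Global memory is dword addressed, so an i8/i16 store is emulated with
              // a masked dword store (STORE_MSKOR): shift the truncated value and its
              // mask into position within the containing dword.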
   1336       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
   1337                                       DAG.getConstant(2, MVT::i32));
   1338       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
   1339                                       DAG.getConstant(0x00000003, VT));
   1340       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
   1341       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
   1342                                    DAG.getConstant(3, VT));
   1343       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
   1344       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
   1345       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
   1346       // vector instead.
   1347       SDValue Src[4] = {
   1348         ShiftedValue,
   1349         DAG.getConstant(0, MVT::i32),
   1350         DAG.getConstant(0, MVT::i32),
   1351         Mask
   1352       };
   1353       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
   1354       SDValue Args[3] = { Chain, Input, DWordAddr };
   1355       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
   1356                                      Op->getVTList(), Args, MemVT,
   1357                                      StoreNode->getMemOperand());
   1358     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
   1359                Value.getValueType().bitsGE(MVT::i32)) {
   1360       // Convert pointer from byte address to dword address.
   1361       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
   1362                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
   1363                                     Ptr, DAG.getConstant(2, MVT::i32)));
   1364 
   1365       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
   1366         llvm_unreachable("Truncated and indexed stores not supported yet");
   1367       } else {
   1368         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
   1369       }
   1370       return Chain;
   1371     }
   1372   }
   1373 
   1374   EVT ValueVT = Value.getValueType();
   1375 
   1376   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1377     return SDValue();
   1378   }
   1379 
   1380   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   1381   if (Ret.getNode()) {
   1382     return Ret;
   1383   }
   1384   // Lowering for indirect addressing
   1385 
   1386   const MachineFunction &MF = DAG.getMachineFunction();
   1387   const AMDGPUFrameLowering *TFL =
   1388       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
   1389   unsigned StackWidth = TFL->getStackWidth(MF);
   1390 
   1391   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1392 
   1393   if (ValueVT.isVector()) {
   1394     unsigned NumElemVT = ValueVT.getVectorNumElements();
   1395     EVT ElemVT = ValueVT.getVectorElementType();
   1396     SmallVector<SDValue, 4> Stores(NumElemVT);
   1397 
   1398     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
   1399                                       "vector width in store");
   1400 
   1401     for (unsigned i = 0; i < NumElemVT; ++i) {
   1402       unsigned Channel, PtrIncr;
   1403       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1404       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1405                         DAG.getConstant(PtrIncr, MVT::i32));
   1406       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
   1407                                  Value, DAG.getConstant(i, MVT::i32));
   1408 
   1409       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
   1410                               Chain, Elem, Ptr,
   1411                               DAG.getTargetConstant(Channel, MVT::i32));
   1412     }
   1413     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
   1414   } else {
   1415     if (ValueVT == MVT::i8) {
   1416       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
   1417     }
   1418     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
   1419                         Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); // Channel
   1420   }
   1421 
   1422   return Chain;
   1423 }
   1424 
   1425 // return (512 + (kc_bank << 12))
   1426 static int
   1427 ConstantAddressBlock(unsigned AddressSpace) {
   1428   switch (AddressSpace) {
   1429   case AMDGPUAS::CONSTANT_BUFFER_0:
   1430     return 512;
   1431   case AMDGPUAS::CONSTANT_BUFFER_1:
   1432     return 512 + 4096;
   1433   case AMDGPUAS::CONSTANT_BUFFER_2:
   1434     return 512 + 4096 * 2;
   1435   case AMDGPUAS::CONSTANT_BUFFER_3:
   1436     return 512 + 4096 * 3;
   1437   case AMDGPUAS::CONSTANT_BUFFER_4:
   1438     return 512 + 4096 * 4;
   1439   case AMDGPUAS::CONSTANT_BUFFER_5:
   1440     return 512 + 4096 * 5;
   1441   case AMDGPUAS::CONSTANT_BUFFER_6:
   1442     return 512 + 4096 * 6;
   1443   case AMDGPUAS::CONSTANT_BUFFER_7:
   1444     return 512 + 4096 * 7;
   1445   case AMDGPUAS::CONSTANT_BUFFER_8:
   1446     return 512 + 4096 * 8;
   1447   case AMDGPUAS::CONSTANT_BUFFER_9:
   1448     return 512 + 4096 * 9;
   1449   case AMDGPUAS::CONSTANT_BUFFER_10:
   1450     return 512 + 4096 * 10;
   1451   case AMDGPUAS::CONSTANT_BUFFER_11:
   1452     return 512 + 4096 * 11;
   1453   case AMDGPUAS::CONSTANT_BUFFER_12:
   1454     return 512 + 4096 * 12;
   1455   case AMDGPUAS::CONSTANT_BUFFER_13:
   1456     return 512 + 4096 * 13;
   1457   case AMDGPUAS::CONSTANT_BUFFER_14:
   1458     return 512 + 4096 * 14;
   1459   case AMDGPUAS::CONSTANT_BUFFER_15:
   1460     return 512 + 4096 * 15;
   1461   default:
   1462     return -1;
   1463   }
   1464 }
   1465 
   1466 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
   1467 {
   1468   EVT VT = Op.getValueType();
   1469   SDLoc DL(Op);
   1470   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   1471   SDValue Chain = Op.getOperand(0);
   1472   SDValue Ptr = Op.getOperand(1);
   1473   SDValue LoweredLoad;
   1474 
   1475   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
   1476   if (Ret.getNode()) {
   1477     SDValue Ops[2] = {
   1478       Ret,
   1479       Chain
   1480     };
   1481     return DAG.getMergeValues(Ops, DL);
   1482   }
   1483 
   1484   // Lower constant address space loads of global variables.
   1485   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
   1486       isa<GlobalVariable>(GetUnderlyingObject(
   1487           LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
   1488 
   1489     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
   1490         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
   1491     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
   1492         DAG.getConstant(2, MVT::i32));
   1493     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
   1494                        LoadNode->getChain(), Ptr,
   1495                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
   1496   }
   1497 
   1498   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
   1499     SDValue MergedValues[2] = {
   1500       ScalarizeVectorLoad(Op, DAG),
   1501       Chain
   1502     };
   1503     return DAG.getMergeValues(MergedValues, DL);
   1504   }
   1505 
   1506   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
   1507   if (ConstantBlock > -1 &&
   1508       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
   1509        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
   1510     SDValue Result;
   1511     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
   1512         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
   1513         isa<ConstantSDNode>(Ptr)) {
   1514       SDValue Slots[4];
   1515       for (unsigned i = 0; i < 4; i++) {
   1516         // We want the Const position encoded with the following formula:
   1517         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
   1518         // const_index is Ptr computed by llvm using an alignment of 16.
   1519         // Thus we add ((512 + (kc_bank << 12)) * 16 + chan * 4) here and
   1520         // then divide by 4 at the ISel step.
   1521         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
   1522             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
   1523         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
   1524       }
   1525       EVT NewVT = MVT::v4i32;
   1526       unsigned NumElements = 4;
   1527       if (VT.isVector()) {
   1528         NewVT = VT;
   1529         NumElements = VT.getVectorNumElements();
   1530       }
   1531       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
   1532                            makeArrayRef(Slots, NumElements));
   1533     } else {
   1534       // A non-constant ptr can't be folded, so keep it as a v4f32 load.
   1535       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
   1536           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
   1537           DAG.getConstant(LoadNode->getAddressSpace() -
   1538                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
   1539           );
   1540     }
   1541 
   1542     if (!VT.isVector()) {
   1543       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
   1544           DAG.getConstant(0, MVT::i32));
   1545     }
   1546 
   1547     SDValue MergedValues[2] = {
   1548       Result,
   1549       Chain
   1550     };
   1551     return DAG.getMergeValues(MergedValues, DL);
   1552   }
   1553 
   1554   // For most operations returning SDValue() will result in the node being
   1555   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
   1556   // need to manually expand loads that may be legal in some address spaces and
   1557   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
   1558   // compute shaders, since the data is sign extended when it is uploaded to the
   1559   // buffer. However SEXT loads from other address spaces are not supported, so
   1560   // we need to expand them here.
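          // The expansion is:
          //   sextload -> (sra (shl (extload x), VTBits - MemBits), VTBits - MemBits)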
   1561   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
   1562     EVT MemVT = LoadNode->getMemoryVT();
   1563     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
   1564     SDValue ShiftAmount =
   1565           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
   1566     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
   1567                                   LoadNode->getPointerInfo(), MemVT,
   1568                                   LoadNode->isVolatile(),
   1569                                   LoadNode->isNonTemporal(),
   1570                                   LoadNode->isInvariant(),
   1571                                   LoadNode->getAlignment());
   1572     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
   1573     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
   1574 
   1575     SDValue MergedValues[2] = { Sra, Chain };
   1576     return DAG.getMergeValues(MergedValues, DL);
   1577   }
   1578 
   1579   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1580     return SDValue();
   1581   }
   1582 
   1583   // Lowering for indirect addressing
   1584   const MachineFunction &MF = DAG.getMachineFunction();
   1585   const AMDGPUFrameLowering *TFL =
   1586       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
   1587   unsigned StackWidth = TFL->getStackWidth(MF);
   1588 
   1589   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1590 
   1591   if (VT.isVector()) {
   1592     unsigned NumElemVT = VT.getVectorNumElements();
   1593     EVT ElemVT = VT.getVectorElementType();
   1594     SDValue Loads[4];
   1595 
   1596     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
   1597                                       "vector width in load");
   1598 
   1599     for (unsigned i = 0; i < NumElemVT; ++i) {
   1600       unsigned Channel, PtrIncr;
   1601       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1602       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1603                         DAG.getConstant(PtrIncr, MVT::i32));
   1604       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
   1605                              Chain, Ptr,
   1606                              DAG.getTargetConstant(Channel, MVT::i32),
   1607                              Op.getOperand(2));
   1608     }
   1609     for (unsigned i = NumElemVT; i < 4; ++i) {
   1610       Loads[i] = DAG.getUNDEF(ElemVT);
   1611     }
   1612     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
   1613     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
   1614   } else {
   1615     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
   1616                               Chain, Ptr,
   1617                               DAG.getTargetConstant(0, MVT::i32), // Channel
   1618                               Op.getOperand(2));
   1619   }
   1620 
   1621   SDValue Ops[2] = {
   1622     LoweredLoad,
   1623     Chain
   1624   };
   1625 
   1626   return DAG.getMergeValues(Ops, DL);
   1627 }
   1628 
   1629 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   1630   SDValue Chain = Op.getOperand(0);
   1631   SDValue Cond  = Op.getOperand(1);
   1632   SDValue Jump  = Op.getOperand(2);
   1633 
   1634   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
   1635                      Chain, Jump, Cond);
   1636 }
   1637 
   1638 /// XXX Only kernel functions are supported, so we can assume for now that
   1639 /// every function is a kernel function, but in the future we should use
   1640 /// separate calling conventions for kernel and non-kernel functions.
   1641 SDValue R600TargetLowering::LowerFormalArguments(
   1642                                       SDValue Chain,
   1643                                       CallingConv::ID CallConv,
   1644                                       bool isVarArg,
   1645                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   1646                                       SDLoc DL, SelectionDAG &DAG,
   1647                                       SmallVectorImpl<SDValue> &InVals) const {
   1648   SmallVector<CCValAssign, 16> ArgLocs;
   1649   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
   1650                  *DAG.getContext());
   1651   MachineFunction &MF = DAG.getMachineFunction();
   1652   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
   1653 
   1654   SmallVector<ISD::InputArg, 8> LocalIns;
   1655 
   1656   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
   1657 
   1658   AnalyzeFormalArguments(CCInfo, LocalIns);
   1659 
   1660   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
   1661     CCValAssign &VA = ArgLocs[i];
   1662     const ISD::InputArg &In = Ins[i];
   1663     EVT VT = In.VT;
   1664     EVT MemVT = VA.getLocVT();
   1665     if (!VT.isVector() && MemVT.isVector()) {
   1666       // Get load source type if scalarized.
   1667       MemVT = MemVT.getVectorElementType();
   1668     }
   1669 
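            // Graphics shaders receive their inputs in registers; compute kernels
            // instead load their arguments from CONSTANT_BUFFER_0, past the 36-byte
            // block of dispatch information described below.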
   1670     if (MFI->getShaderType() != ShaderType::COMPUTE) {
   1671       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
   1672       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
   1673       InVals.push_back(Register);
   1674       continue;
   1675     }
   1676 
   1677     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
   1678                                           AMDGPUAS::CONSTANT_BUFFER_0);
   1679 
   1680     // i64 isn't a legal type, so the register type used ends up as i32, which
   1681     // isn't expected here. It attempts to create this sextload, but it ends up
   1682     // being invalid. Somehow this seems to work with i64 arguments, but breaks
   1683     // for <1 x i64>.
   1684 
   1685     // The first 36 bytes of the input buffer contain information about
   1686     // thread group and global sizes.
   1687     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
   1688     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
   1689       // FIXME: This should really check the extload type, but the handling of
   1690       // extload vector parameters seems to be broken.
   1691 
   1692       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
   1693       Ext = ISD::SEXTLOAD;
   1694     }
   1695 
   1696     // Compute the offset from the value.
   1697     // XXX - I think PartOffset should give you this, but it seems to give the
   1698     // size of the register which isn't useful.
   1699 
   1700     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
   1701     unsigned PartOffset = VA.getLocMemOffset();
   1702     unsigned Offset = 36 + VA.getLocMemOffset();
   1703 
   1704     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
   1705     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
   1706                               DAG.getConstant(Offset, MVT::i32),
   1707                               DAG.getUNDEF(MVT::i32),
   1708                               PtrInfo,
   1709                               MemVT, false, true, true, 4);
   1710 
   1711     // 4 is the preferred alignment for the CONSTANT memory space.
   1712     InVals.push_back(Arg);
   1713     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
   1714   }
   1715   return Chain;
   1716 }
   1717 
   1718 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   1719    if (!VT.isVector())
   1720      return MVT::i32;
   1721    return VT.changeVectorElementTypeToInteger();
   1722 }
   1723 
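        /// Replace BUILD_VECTOR operands that are undef, the constants 0.0 / 1.0, or
        /// duplicates of an earlier element with the corresponding swizzle selectors
        /// (SEL_MASK_WRITE, SEL_0, SEL_1, or the earlier channel), recording the
        /// old-to-new channel mapping in \p RemapSwizzle.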
   1724 static SDValue CompactSwizzlableVector(
   1725   SelectionDAG &DAG, SDValue VectorEntry,
   1726   DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1727   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1728   assert(RemapSwizzle.empty());
   1729   SDValue NewBldVec[4] = {
   1730     VectorEntry.getOperand(0),
   1731     VectorEntry.getOperand(1),
   1732     VectorEntry.getOperand(2),
   1733     VectorEntry.getOperand(3)
   1734   };
   1735 
   1736   for (unsigned i = 0; i < 4; i++) {
   1737     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
   1738       // We mask the write here to teach later passes that the ith element of this
   1739       // vector is undef. Thus we can use it to reduce 128-bit register usage,
   1740       // break false dependencies and additionally make the assembly easier to read.
   1741       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
   1742     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
   1743       if (C->isZero()) {
   1744         RemapSwizzle[i] = 4; // SEL_0
   1745         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1746       } else if (C->isExactlyValue(1.0)) {
   1747         RemapSwizzle[i] = 5; // SEL_1
   1748         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1749       }
   1750     }
   1751 
   1752     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
   1753       continue;
   1754     for (unsigned j = 0; j < i; j++) {
   1755       if (NewBldVec[i] == NewBldVec[j]) {
   1756         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
   1757         RemapSwizzle[i] = j;
   1758         break;
   1759       }
   1760     }
   1761   }
   1762 
   1763   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1764                      VectorEntry.getValueType(), NewBldVec);
   1765 }
   1766 
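        /// Reorder the operands of a BUILD_VECTOR so that, where possible, an element
        /// produced by EXTRACT_VECTOR_ELT ends up in the channel matching its source
        /// index, updating \p RemapSwizzle with the resulting channel permutation.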
   1767 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
   1768                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1769   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1770   assert(RemapSwizzle.empty());
   1771   SDValue NewBldVec[4] = {
   1772       VectorEntry.getOperand(0),
   1773       VectorEntry.getOperand(1),
   1774       VectorEntry.getOperand(2),
   1775       VectorEntry.getOperand(3)
   1776   };
   1777   bool isUnmovable[4] = { false, false, false, false };
   1778   for (unsigned i = 0; i < 4; i++) {
   1779     RemapSwizzle[i] = i;
   1780     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   1781       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
   1782           ->getZExtValue();
   1783       if (i == Idx)
   1784         isUnmovable[Idx] = true;
   1785     }
   1786   }
   1787 
   1788   for (unsigned i = 0; i < 4; i++) {
   1789     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   1790       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
   1791           ->getZExtValue();
   1792       if (isUnmovable[Idx])
   1793         continue;
   1794       // Swap i and Idx
   1795       std::swap(NewBldVec[Idx], NewBldVec[i]);
   1796       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
   1797       break;
   1798     }
   1799   }
   1800 
   1801   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1802                      VectorEntry.getValueType(), NewBldVec);
   1803 }
   1804 
   1805 
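        /// Compact and reorganize the BUILD_VECTOR feeding an export or texture fetch
        /// and rewrite the four swizzle operands in \p Swz to match the new element
        /// positions.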
   1806 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
   1807 SDValue Swz[4], SelectionDAG &DAG) const {
   1808   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   1809   // Old -> New swizzle values
   1810   DenseMap<unsigned, unsigned> SwizzleRemap;
   1811 
   1812   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
   1813   for (unsigned i = 0; i < 4; i++) {
   1814     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1815     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1816       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
   1817   }
   1818 
   1819   SwizzleRemap.clear();
   1820   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
   1821   for (unsigned i = 0; i < 4; i++) {
   1822     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1823     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1824       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
   1825   }
   1826 
   1827   return BuildVector;
   1828 }
   1829 
   1830 
   1831 //===----------------------------------------------------------------------===//
   1832 // Custom DAG Optimizations
   1833 //===----------------------------------------------------------------------===//
   1834 
   1835 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   1836                                               DAGCombinerInfo &DCI) const {
   1837   SelectionDAG &DAG = DCI.DAG;
   1838 
   1839   switch (N->getOpcode()) {
   1840   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   1841   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
   1842   case ISD::FP_ROUND: {
   1843       SDValue Arg = N->getOperand(0);
   1844       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
   1845         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
   1846                            Arg.getOperand(0));
   1847       }
   1848       break;
   1849     }
   1850 
   1851   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
   1852   // (i32 select_cc f32, f32, -1, 0 cc)
   1853   //
   1854   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
   1855   // this to one of the SET*_DX10 instructions.
   1856   case ISD::FP_TO_SINT: {
   1857     SDValue FNeg = N->getOperand(0);
   1858     if (FNeg.getOpcode() != ISD::FNEG) {
   1859       return SDValue();
   1860     }
   1861     SDValue SelectCC = FNeg.getOperand(0);
   1862     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
   1863         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
   1864         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
   1865         !isHWTrueValue(SelectCC.getOperand(2)) ||
   1866         !isHWFalseValue(SelectCC.getOperand(3))) {
   1867       return SDValue();
   1868     }
   1869 
   1870     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
   1871                            SelectCC.getOperand(0), // LHS
   1872                            SelectCC.getOperand(1), // RHS
   1873                            DAG.getConstant(-1, MVT::i32), // True
   1874                            DAG.getConstant(0, MVT::i32),  // False
   1875                            SelectCC.getOperand(4)); // CC
   1876 
   1877     break;
   1878   }
   1879 
   1880   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
   1881   // => build_vector elt0, ... , NewEltIdx, ... , eltN
   1882   case ISD::INSERT_VECTOR_ELT: {
   1883     SDValue InVec = N->getOperand(0);
   1884     SDValue InVal = N->getOperand(1);
   1885     SDValue EltNo = N->getOperand(2);
   1886     SDLoc dl(N);
   1887 
   1888     // If the inserted element is an UNDEF, just use the input vector.
   1889     if (InVal.getOpcode() == ISD::UNDEF)
   1890       return InVec;
   1891 
   1892     EVT VT = InVec.getValueType();
   1893 
   1894     // If we can't generate a legal BUILD_VECTOR, exit
   1895     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
   1896       return SDValue();
   1897 
   1898     // Check that we know which element is being inserted
   1899     if (!isa<ConstantSDNode>(EltNo))
   1900       return SDValue();
   1901     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   1902 
   1903     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   1904     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   1905     // vector elements.
   1906     SmallVector<SDValue, 8> Ops;
   1907     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   1908       Ops.append(InVec.getNode()->op_begin(),
   1909                  InVec.getNode()->op_end());
   1910     } else if (InVec.getOpcode() == ISD::UNDEF) {
   1911       unsigned NElts = VT.getVectorNumElements();
   1912       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
   1913     } else {
   1914       return SDValue();
   1915     }
   1916 
   1917     // Insert the element
   1918     if (Elt < Ops.size()) {
   1919       // All the operands of BUILD_VECTOR must have the same type;
   1920       // we enforce that here.
   1921       EVT OpVT = Ops[0].getValueType();
   1922       if (InVal.getValueType() != OpVT)
   1923         InVal = OpVT.bitsGT(InVal.getValueType()) ?
   1924           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
   1925           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
   1926       Ops[Elt] = InVal;
   1927     }
   1928 
   1929     // Return the new vector
   1930     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   1931   }
   1932 
   1933   // Extract_vec (Build_vector) generated by custom lowering
   1934   // also needs to be combined here.
   1935   case ISD::EXTRACT_VECTOR_ELT: {
   1936     SDValue Arg = N->getOperand(0);
   1937     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
   1938       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1939         unsigned Element = Const->getZExtValue();
   1940         return Arg->getOperand(Element);
   1941       }
   1942     }
   1943     if (Arg.getOpcode() == ISD::BITCAST &&
   1944         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   1945       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1946         unsigned Element = Const->getZExtValue();
   1947         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
   1948             Arg->getOperand(0).getOperand(Element));
   1949       }
   1950     }
   1951   }
   1952 
   1953   case ISD::SELECT_CC: {
   1954     // Try common optimizations
   1955     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   1956     if (Ret.getNode())
   1957       return Ret;
   1958 
   1959     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
   1960     //      selectcc x, y, a, b, inv(cc)
   1961     //
   1962     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
   1963     //      selectcc x, y, a, b, cc
   1964     SDValue LHS = N->getOperand(0);
   1965     if (LHS.getOpcode() != ISD::SELECT_CC) {
   1966       return SDValue();
   1967     }
   1968 
   1969     SDValue RHS = N->getOperand(1);
   1970     SDValue True = N->getOperand(2);
   1971     SDValue False = N->getOperand(3);
   1972     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
   1973 
   1974     if (LHS.getOperand(2).getNode() != True.getNode() ||
   1975         LHS.getOperand(3).getNode() != False.getNode() ||
   1976         RHS.getNode() != False.getNode()) {
   1977       return SDValue();
   1978     }
   1979 
   1980     switch (NCC) {
   1981     default: return SDValue();
   1982     case ISD::SETNE: return LHS;
   1983     case ISD::SETEQ: {
   1984       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
   1985       LHSCC = ISD::getSetCCInverse(LHSCC,
   1986                                   LHS.getOperand(0).getValueType().isInteger());
   1987       if (DCI.isBeforeLegalizeOps() ||
   1988           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
   1989         return DAG.getSelectCC(SDLoc(N),
   1990                                LHS.getOperand(0),
   1991                                LHS.getOperand(1),
   1992                                LHS.getOperand(2),
   1993                                LHS.getOperand(3),
   1994                                LHSCC);
   1995       break;
   1996     }
   1997     }
   1998     return SDValue();
   1999   }
   2000 
   2001   case AMDGPUISD::EXPORT: {
   2002     SDValue Arg = N->getOperand(1);
   2003     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2004       break;
   2005 
   2006     SDValue NewArgs[8] = {
   2007       N->getOperand(0), // Chain
   2008       SDValue(),
   2009       N->getOperand(2), // ArrayBase
   2010       N->getOperand(3), // Type
   2011       N->getOperand(4), // SWZ_X
   2012       N->getOperand(5), // SWZ_Y
   2013       N->getOperand(6), // SWZ_Z
   2014       N->getOperand(7) // SWZ_W
   2015     };
   2016     SDLoc DL(N);
   2017     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
   2018     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
   2019   }
   2020   case AMDGPUISD::TEXTURE_FETCH: {
   2021     SDValue Arg = N->getOperand(1);
   2022     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2023       break;
   2024 
   2025     SDValue NewArgs[19] = {
   2026       N->getOperand(0),
   2027       N->getOperand(1),
   2028       N->getOperand(2),
   2029       N->getOperand(3),
   2030       N->getOperand(4),
   2031       N->getOperand(5),
   2032       N->getOperand(6),
   2033       N->getOperand(7),
   2034       N->getOperand(8),
   2035       N->getOperand(9),
   2036       N->getOperand(10),
   2037       N->getOperand(11),
   2038       N->getOperand(12),
   2039       N->getOperand(13),
   2040       N->getOperand(14),
   2041       N->getOperand(15),
   2042       N->getOperand(16),
   2043       N->getOperand(17),
   2044       N->getOperand(18),
   2045     };
   2046     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
   2047     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
   2048         NewArgs);
   2049   }
   2050   }
   2051 
   2052   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   2053 }
   2054 
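        /// Try to fold a source modifier (FNEG/FABS), a constant-buffer read, or an
        /// inline/literal immediate into the operand \p Src of \p ParentNode, updating
        /// the matching \p Neg, \p Abs, \p Sel or \p Imm operand. Returns true if
        /// \p Src was rewritten and the node should be rebuilt.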
   2055 static bool
   2056 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
   2057             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
   2058   const R600InstrInfo *TII =
   2059       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
   2060   if (!Src.isMachineOpcode())
   2061     return false;
   2062   switch (Src.getMachineOpcode()) {
   2063   case AMDGPU::FNEG_R600:
   2064     if (!Neg.getNode())
   2065       return false;
   2066     Src = Src.getOperand(0);
   2067     Neg = DAG.getTargetConstant(1, MVT::i32);
   2068     return true;
   2069   case AMDGPU::FABS_R600:
   2070     if (!Abs.getNode())
   2071       return false;
   2072     Src = Src.getOperand(0);
   2073     Abs = DAG.getTargetConstant(1, MVT::i32);
   2074     return true;
   2075   case AMDGPU::CONST_COPY: {
   2076     unsigned Opcode = ParentNode->getMachineOpcode();
   2077     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2078 
   2079     if (!Sel.getNode())
   2080       return false;
   2081 
   2082     SDValue CstOffset = Src.getOperand(0);
   2083     if (ParentNode->getValueType(0).isVector())
   2084       return false;
   2085 
   2086     // Gather constant values
   2087     int SrcIndices[] = {
   2088       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
   2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2096       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2097       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2098       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2099     };
   2100     std::vector<unsigned> Consts;
   2101     for (int OtherSrcIdx : SrcIndices) {
   2102       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
   2103       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
   2104         continue;
   2105       if (HasDst) {
   2106         OtherSrcIdx--;
   2107         OtherSelIdx--;
   2108       }
   2109       if (RegisterSDNode *Reg =
   2110           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
   2111         if (Reg->getReg() == AMDGPU::ALU_CONST) {
   2112           ConstantSDNode *Cst
   2113             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
   2114           Consts.push_back(Cst->getZExtValue());
   2115         }
   2116       }
   2117     }
   2118 
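            // Include the constant we want to fold and check that the full set of
            // constant reads is still encodable for this instruction.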
   2119     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
   2120     Consts.push_back(Cst->getZExtValue());
   2121     if (!TII->fitsConstReadLimitations(Consts)) {
   2122       return false;
   2123     }
   2124 
   2125     Sel = CstOffset;
   2126     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
   2127     return true;
   2128   }
   2129   case AMDGPU::MOV_IMM_I32:
   2130   case AMDGPU::MOV_IMM_F32: {
   2131     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
   2132     uint64_t ImmValue = 0;
   2133 
   2134 
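            // 0.0, 0.5, 1.0 and the integers 0 and 1 have dedicated inline-constant
            // registers; any other immediate has to go through the ALU_LITERAL_X slot.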
   2135     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
   2136       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
   2137       float FloatValue = FPC->getValueAPF().convertToFloat();
   2138       if (FloatValue == 0.0) {
   2139         ImmReg = AMDGPU::ZERO;
   2140       } else if (FloatValue == 0.5) {
   2141         ImmReg = AMDGPU::HALF;
   2142       } else if (FloatValue == 1.0) {
   2143         ImmReg = AMDGPU::ONE;
   2144       } else {
   2145         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
   2146       }
   2147     } else {
   2148       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
   2149       uint64_t Value = C->getZExtValue();
   2150       if (Value == 0) {
   2151         ImmReg = AMDGPU::ZERO;
   2152       } else if (Value == 1) {
   2153         ImmReg = AMDGPU::ONE_INT;
   2154       } else {
   2155         ImmValue = Value;
   2156       }
   2157     }
   2158 
   2159     // Check that we aren't already using an immediate.
   2160     // XXX: It's possible for an instruction to have more than one
   2161     // immediate operand, but this is not supported yet.
   2162     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
   2163       if (!Imm.getNode())
   2164         return false;
   2165       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
   2166       assert(C);
   2167       if (C->getZExtValue())
   2168         return false;
   2169       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
   2170     }
   2171     Src = DAG.getRegister(ImmReg, MVT::i32);
   2172     return true;
   2173   }
   2174   default:
   2175     return false;
   2176   }
   2177 }
   2178 
   2179 
   2180 /// \brief Fold the instructions after selecting them
   2181 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
   2182                                             SelectionDAG &DAG) const {
   2183   const R600InstrInfo *TII =
   2184       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
   2185   if (!Node->isMachineOpcode())
   2186     return Node;
   2187   unsigned Opcode = Node->getMachineOpcode();
   2188   SDValue FakeOp;
   2189 
   2190   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
   2191 
   2192   if (Opcode == AMDGPU::DOT_4) {
   2193     int OperandIdx[] = {
   2194       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2195       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2196       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2197       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2198       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2199       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2200       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2201       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2202         };
   2203     int NegIdx[] = {
   2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
   2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
   2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
   2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
   2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
   2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
   2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
   2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
   2212     };
   2213     int AbsIdx[] = {
   2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
   2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
   2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
   2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
   2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
   2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
   2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
   2221       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
   2222     };
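            // Try to fold a modifier, constant-buffer read, or inline constant into
            // each of the eight DOT_4 source operands; rebuild the node as soon as
            // one fold succeeds.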
   2223     for (unsigned i = 0; i < 8; i++) {
   2224       if (OperandIdx[i] < 0)
   2225         return Node;
   2226       SDValue &Src = Ops[OperandIdx[i] - 1];
   2227       SDValue &Neg = Ops[NegIdx[i] - 1];
   2228       SDValue &Abs = Ops[AbsIdx[i] - 1];
   2229       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2230       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2231       if (HasDst)
   2232         SelIdx--;
   2233       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2234       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
   2235         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2236     }
   2237   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
   2238     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
   2239       SDValue &Src = Ops[i];
   2240       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
   2241         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2242     }
   2243   } else if (Opcode == AMDGPU::CLAMP_R600) {
   2244     SDValue Src = Node->getOperand(0);
   2245     if (!Src.isMachineOpcode() ||
   2246         !TII->hasInstrModifiers(Src.getMachineOpcode()))
   2247       return Node;
   2248     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
   2249         AMDGPU::OpName::clamp);
   2250     if (ClampIdx < 0)
   2251       return Node;
   2252     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
   2253     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
   2254     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
   2255         Node->getVTList(), Ops);
   2256   } else {
   2257     if (!TII->hasInstrModifiers(Opcode))
   2258       return Node;
   2259     int OperandIdx[] = {
   2260       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2261       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2262       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
   2263     };
   2264     int NegIdx[] = {
   2265       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
   2266       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
   2267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
   2268     };
   2269     int AbsIdx[] = {
   2270       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
   2271       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
   2272       -1
   2273     };
   2274     for (unsigned i = 0; i < 3; i++) {
   2275       if (OperandIdx[i] < 0)
   2276         return Node;
   2277       SDValue &Src = Ops[OperandIdx[i] - 1];
   2278       SDValue &Neg = Ops[NegIdx[i] - 1];
   2279       SDValue FakeAbs;
   2280       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
   2281       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2282       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2283       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
   2284       if (HasDst) {
   2285         SelIdx--;
   2286         ImmIdx--;
   2287       }
   2288       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2289       SDValue &Imm = Ops[ImmIdx];
   2290       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
   2291         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2292     }
   2293   }
   2294 
   2295   return Node;
   2296 }
   2297