//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // Expand sign extension of vectors and, when BFE is unavailable, of small
  // scalar types.
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::SUB, MVT::i64, Expand);

  // These should be replaced by UDIVREM, but that does not happen
  // automatically during Type Legalization.
  setOperationAction(ISD::UDIV, MVT::i64, Custom);
  setOperationAction(ISD::UREM, MVT::i64, Custom);
  setOperationAction(ISD::SDIV, MVT::i64, Custom);
  setOperationAction(ISD::SREM, MVT::i64, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

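// Expand pseudo instructions whose TableGen definitions are marked with
// usesCustomInserter: R600 uses this hook to fold CLAMP/ABS/NEG flags into
// MOVs, expand texture-gradient and branch pseudos, and finalize exports and
// returns.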
MachineBasicBlock *R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr *MI, MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr *defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
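    // Expand TXD into an explicit gradient setup followed by a gradient
    // sample: TEX_SET_GRADIENTS_H and TEX_SET_GRADIENTS_V load the horizontal
    // and vertical derivatives, then TEX_SAMPLE_G consumes both results as
    // implicit uses.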
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
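    // Same expansion as TXD above, but the final sample is TEX_SAMPLE_C_G so
    // the shadow comparison is applied.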
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
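    // Expand into a PRED_X that sets PREDICATE_BIT from the f32 condition,
    // followed by a JUMP_COND predicated on that bit.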
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // The instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = std::next(I)->getOpcode() == AMDGPU::RETURN;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    // The CF instruction encoding differs between Evergreen (84) and R600 (40).
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
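      // A negative I/J base means no barycentric registers are supplied, so
      // the parameter is read directly with INTERP_VEC_LOAD; otherwise it is
      // interpolated from the live-in I/J register pair chosen below.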
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
          SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    case Intrinsic::AMDGPU_rsq:
      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::UDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM);
    break;
  }
  case ISD::UREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM.getValue(1));
    break;
  }
  case ISD::SDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM);
    break;
  }
  case ISD::SREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM.getValue(1));
    break;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    SDValue one = DAG.getConstant(1, HalfVT);
    SDValue zero = DAG.getConstant(0, HalfVT);

    // Hi/Lo split
    SDValue LHS = N->getOperand(0);
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
    SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);

    SDValue RHS = N->getOperand(1);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);

    // Get speculative values
    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

    SDValue REM_Hi = zero;
    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);

    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
    SDValue DIV_Lo = zero;

    const unsigned halfBitWidth = HalfVT.getSizeInBits();

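    // Restoring long division: each iteration shifts the next bit of LHS_Lo
    // into the running remainder, then sets the corresponding quotient bit
    // and subtracts RHS whenever the remainder has grown large enough.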
    for (unsigned i = 0; i < halfBitWidth; ++i) {
      SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
      // Get value of high bit
      SDValue HBit;
      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
      } else {
        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
      }

      SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
        DAG.getConstant(halfBitWidth - 1, HalfVT));
      REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
      REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);

      REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
      REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);

      SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);

      // Use an unsigned literal so shifting into bit 31 is well defined.
      SDValue BIT = DAG.getConstant(1U << (halfBitWidth - i - 1), HalfVT);
      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);

      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

      // Update REM
      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);

      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
    }

    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
    Results.push_back(DIV);
    Results.push_back(REM);
    break;
  }
  }
}

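// Rebuild the vector with AMDGPUISD::BUILD_VERTICAL_VECTOR by extracting every
// element; this produces the layout that R600 indirect addressing of vectors
// expects.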
SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                               Vector, DAG.getConstant(i, getVectorIdxTy())));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)), // 1 / (2 * pi)
        DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, the COS/SIN input must be between -Pi and Pi, so scale back up.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32)); // pi
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One  = DAG.getConstant(1, VT);

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift could be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative would be to
  // add a conditional to filter out the special case.

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One  = DAG.getConstant(1, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift could be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative would be to
  // add a conditional to filter out the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SETCC, SDLoc(Op), MVT::i1, Op,
                     DAG.getConstantFP(0.0f, MVT::f32),
                     DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType *PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}
   1152 
   1153 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   1154   SDLoc DL(Op);
   1155   EVT VT = Op.getValueType();
   1156 
   1157   SDValue LHS = Op.getOperand(0);
   1158   SDValue RHS = Op.getOperand(1);
   1159   SDValue True = Op.getOperand(2);
   1160   SDValue False = Op.getOperand(3);
   1161   SDValue CC = Op.getOperand(4);
   1162   SDValue Temp;
   1163 
   1164   // LHS and RHS are guaranteed to be the same value type
   1165   EVT CompareVT = LHS.getValueType();
   1166 
   1167   // Check if we can lower this to a native operation.
   1168 
   1169   // Try to lower to a SET* instruction:
   1170   //
   1171   // SET* can match the following patterns:
   1172   //
   1173   // select_cc f32, f32, -1,  0, cc_supported
   1174   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
   1175   // select_cc i32, i32, -1,  0, cc_supported
   1176   //
   1177 
   1178   // Move hardware True/False values to the correct operand.
   1179   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1180   ISD::CondCode InverseCC =
   1181      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1182   if (isHWTrueValue(False) && isHWFalseValue(True)) {
   1183     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
   1184       std::swap(False, True);
   1185       CC = DAG.getCondCode(InverseCC);
   1186     } else {
   1187       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
   1188       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
   1189         std::swap(False, True);
   1190         std::swap(LHS, RHS);
   1191         CC = DAG.getCondCode(SwapInvCC);
   1192       }
   1193     }
   1194   }
   1195 
   1196   if (isHWTrueValue(True) && isHWFalseValue(False) &&
   1197       (CompareVT == VT || VT == MVT::i32)) {
   1198     // This can be matched by a SET* instruction.
   1199     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
   1200   }
   1201 
   1202   // Try to lower to a CND* instruction:
   1203   //
   1204   // CND* can match the following patterns:
   1205   //
   1206   // select_cc f32, 0.0, f32, f32, cc_supported
   1207   // select_cc f32, 0.0, i32, i32, cc_supported
   1208   // select_cc i32, 0,   f32, f32, cc_supported
   1209   // select_cc i32, 0,   i32, i32, cc_supported
   1210   //
   1211 
   1212   // Try to move the zero value to the RHS
   1213   if (isZero(LHS)) {
   1214     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1215     // Try swapping the operands
   1216     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
   1217     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1218       std::swap(LHS, RHS);
   1219       CC = DAG.getCondCode(CCSwapped);
   1220     } else {
   1221       // Try inverting the conditon and then swapping the operands
   1222       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
   1223       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
   1224       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1225         std::swap(True, False);
   1226         std::swap(LHS, RHS);
   1227         CC = DAG.getCondCode(CCSwapped);
   1228       }
   1229     }
   1230   }
   1231   if (isZero(RHS)) {
   1232     SDValue Cond = LHS;
   1233     SDValue Zero = RHS;
   1234     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1235     if (CompareVT != VT) {
   1236       // Bitcast True / False to the correct types.  This will end up being
   1237       // a nop, but it allows us to define only a single pattern in the
   1238       // .TD files for each CND* instruction rather than having to have
   1239       // one pattern for integer True/False and one for fp True/False
   1240       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
   1241       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
   1242     }
   1243 
   1244     switch (CCOpcode) {
   1245     case ISD::SETONE:
   1246     case ISD::SETUNE:
   1247     case ISD::SETNE:
   1248       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1249       Temp = True;
   1250       True = False;
   1251       False = Temp;
   1252       break;
   1253     default:
   1254       break;
   1255     }
   1256     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
   1257         Cond, Zero,
   1258         True, False,
   1259         DAG.getCondCode(CCOpcode));
   1260     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
   1261   }
   1262 
   1263   // If we make it this for it means we have no native instructions to handle
   1264   // this SELECT_CC, so we must lower it.
   1265   SDValue HWTrue, HWFalse;
   1266 
   1267   if (CompareVT == MVT::f32) {
   1268     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
   1269     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
   1270   } else if (CompareVT == MVT::i32) {
   1271     HWTrue = DAG.getConstant(-1, CompareVT);
   1272     HWFalse = DAG.getConstant(0, CompareVT);
   1273   }
   1274   else {
   1275     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
   1276   }
   1277 
   1278   // Lower this unsupported SELECT_CC into a combination of two supported
   1279   // SELECT_CC operations.
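          // e.g. (an illustrative sketch): select_cc f32:x, f32:y, i32:t, i32:f, olt
          // becomes:
          //   c = select_cc x, y, 1.0f, 0.0f, olt   (a legal SET* pattern)
          //   r = select_cc c, 0.0f, t, f, ne       (a legal CND* pattern)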
   1280   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
   1281 
   1282   return DAG.getNode(ISD::SELECT_CC, DL, VT,
   1283       Cond, HWFalse,
   1284       True, False,
   1285       DAG.getCondCode(ISD::SETNE));
   1286 }
   1287 
   1288 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
   1289 /// convert these pointers to a register index.  Each register holds
   1290 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
   1291 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
   1292 /// for indirect addressing.
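        /// For example (illustrative): with \p StackWidth == 2 each slot spans
        /// 2 x 4 bytes, so a byte address becomes a register index via a shift
        /// right by 3 (log2(2 * 4)); width 1 shifts by 2 and width 4 by 4.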
   1293 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
   1294                                                unsigned StackWidth,
   1295                                                SelectionDAG &DAG) const {
   1296   unsigned SRLPad;
   1297   switch (StackWidth) {
   1298   case 1:
   1299     SRLPad = 2;
   1300     break;
   1301   case 2:
   1302     SRLPad = 3;
   1303     break;
   1304   case 4:
   1305     SRLPad = 4;
   1306     break;
   1307   default: llvm_unreachable("Invalid stack width");
   1308   }
   1309 
   1310   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
   1311                      DAG.getConstant(SRLPad, MVT::i32));
   1312 }
   1313 
   1314 void R600TargetLowering::getStackAddress(unsigned StackWidth,
   1315                                          unsigned ElemIdx,
   1316                                          unsigned &Channel,
   1317                                          unsigned &PtrIncr) const {
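          // Illustrative mapping for StackWidth == 2: element 0 -> (channel 0,
          // ptr increment 0), element 1 -> (channel 1, increment 0), and
          // element 2 -> (channel 0, increment 1).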
   1318   switch (StackWidth) {
   1319   default:
   1320   case 1:
   1321     Channel = 0;
   1322     if (ElemIdx > 0) {
   1323       PtrIncr = 1;
   1324     } else {
   1325       PtrIncr = 0;
   1326     }
   1327     break;
   1328   case 2:
   1329     Channel = ElemIdx % 2;
   1330     if (ElemIdx == 2) {
   1331       PtrIncr = 1;
   1332     } else {
   1333       PtrIncr = 0;
   1334     }
   1335     break;
   1336   case 4:
   1337     Channel = ElemIdx;
   1338     PtrIncr = 0;
   1339     break;
   1340   }
   1341 }
   1342 
   1343 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   1344   SDLoc DL(Op);
   1345   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
   1346   SDValue Chain = Op.getOperand(0);
   1347   SDValue Value = Op.getOperand(1);
   1348   SDValue Ptr = Op.getOperand(2);
   1349 
   1350   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   1351   if (Result.getNode()) {
   1352     return Result;
   1353   }
   1354 
   1355   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
   1356     if (StoreNode->isTruncatingStore()) {
   1357       EVT VT = Value.getValueType();
   1358       assert(VT.bitsLE(MVT::i32));
   1359       EVT MemVT = StoreNode->getMemoryVT();
   1360       SDValue MaskConstant;
   1361       if (MemVT == MVT::i8) {
   1362         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
   1363       } else {
   1364         assert(MemVT == MVT::i16);
   1365         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
   1366       }
   1367       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
   1368                                       DAG.getConstant(2, MVT::i32));
   1369       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
   1370                                       DAG.getConstant(0x00000003, VT));
   1371       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
   1372       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
   1373                                    DAG.getConstant(3, VT));
   1374       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
   1375       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
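              // Illustratively, an i8 store of V at byte address P becomes a
              // masked-OR on the dword at P >> 2:
              //   new = (old & ~(0xFF << ((P & 3) * 8)))
              //       | ((V & 0xFF) << ((P & 3) * 8))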
   1376       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
   1377       // vector instead.
   1378       SDValue Src[4] = {
   1379         ShiftedValue,
   1380         DAG.getConstant(0, MVT::i32),
   1381         DAG.getConstant(0, MVT::i32),
   1382         Mask
   1383       };
   1384       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
   1385       SDValue Args[3] = { Chain, Input, DWordAddr };
   1386       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
   1387                                      Op->getVTList(), Args, MemVT,
   1388                                      StoreNode->getMemOperand());
   1389     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
   1390                Value.getValueType().bitsGE(MVT::i32)) {
   1391       // Convert pointer from byte address to dword address.
   1392       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
   1393                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
   1394                                     Ptr, DAG.getConstant(2, MVT::i32)));
   1395 
   1396       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
   1397         llvm_unreachable("Truncated and indexed stores not supported yet");
   1398       } else {
   1399         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
   1400       }
   1401       return Chain;
   1402     }
   1403   }
   1404 
   1405   EVT ValueVT = Value.getValueType();
   1406 
   1407   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1408     return SDValue();
   1409   }
   1410 
   1415   // Lowering for indirect addressing
   1416 
   1417   const MachineFunction &MF = DAG.getMachineFunction();
   1418   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
   1419                                          getTargetMachine().getFrameLowering());
   1420   unsigned StackWidth = TFL->getStackWidth(MF);
   1421 
   1422   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1423 
   1424   if (ValueVT.isVector()) {
   1425     unsigned NumElemVT = ValueVT.getVectorNumElements();
   1426     EVT ElemVT = ValueVT.getVectorElementType();
   1427     SmallVector<SDValue, 4> Stores(NumElemVT);
   1428 
   1429     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
   1430                                       "vector width in store");
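            // e.g. (illustrative) a v2i32 private store with StackWidth == 1
            // emits two REGISTER_STOREs on channel 0 at consecutive register
            // indices.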
   1431 
   1432     for (unsigned i = 0; i < NumElemVT; ++i) {
   1433       unsigned Channel, PtrIncr;
   1434       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1435       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1436                         DAG.getConstant(PtrIncr, MVT::i32));
   1437       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
   1438                                  Value, DAG.getConstant(i, MVT::i32));
   1439 
   1440       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
   1441                               Chain, Elem, Ptr,
   1442                               DAG.getTargetConstant(Channel, MVT::i32));
   1443     }
   1444     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
   1445   } else {
   1446     if (ValueVT == MVT::i8) {
   1447       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
   1448     }
   1449     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
   1450                         Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); // Channel
   1451   }
   1452 
   1453   return Chain;
   1454 }
   1455 
   1456 // Returns 512 + (kc_bank << 12) for constant buffer kc_bank, or -1 otherwise.
   1457 static int
   1458 ConstantAddressBlock(unsigned AddressSpace) {
   1459   switch (AddressSpace) {
   1460   case AMDGPUAS::CONSTANT_BUFFER_0:
   1461     return 512;
   1462   case AMDGPUAS::CONSTANT_BUFFER_1:
   1463     return 512 + 4096;
   1464   case AMDGPUAS::CONSTANT_BUFFER_2:
   1465     return 512 + 4096 * 2;
   1466   case AMDGPUAS::CONSTANT_BUFFER_3:
   1467     return 512 + 4096 * 3;
   1468   case AMDGPUAS::CONSTANT_BUFFER_4:
   1469     return 512 + 4096 * 4;
   1470   case AMDGPUAS::CONSTANT_BUFFER_5:
   1471     return 512 + 4096 * 5;
   1472   case AMDGPUAS::CONSTANT_BUFFER_6:
   1473     return 512 + 4096 * 6;
   1474   case AMDGPUAS::CONSTANT_BUFFER_7:
   1475     return 512 + 4096 * 7;
   1476   case AMDGPUAS::CONSTANT_BUFFER_8:
   1477     return 512 + 4096 * 8;
   1478   case AMDGPUAS::CONSTANT_BUFFER_9:
   1479     return 512 + 4096 * 9;
   1480   case AMDGPUAS::CONSTANT_BUFFER_10:
   1481     return 512 + 4096 * 10;
   1482   case AMDGPUAS::CONSTANT_BUFFER_11:
   1483     return 512 + 4096 * 11;
   1484   case AMDGPUAS::CONSTANT_BUFFER_12:
   1485     return 512 + 4096 * 12;
   1486   case AMDGPUAS::CONSTANT_BUFFER_13:
   1487     return 512 + 4096 * 13;
   1488   case AMDGPUAS::CONSTANT_BUFFER_14:
   1489     return 512 + 4096 * 14;
   1490   case AMDGPUAS::CONSTANT_BUFFER_15:
   1491     return 512 + 4096 * 15;
   1492   default:
   1493     return -1;
   1494   }
   1495 }
   1496 
   1497 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   1499   EVT VT = Op.getValueType();
   1500   SDLoc DL(Op);
   1501   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   1502   SDValue Chain = Op.getOperand(0);
   1503   SDValue Ptr = Op.getOperand(1);
   1504   SDValue LoweredLoad;
   1505 
   1506   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
   1507   if (Ret.getNode()) {
   1508     SDValue Ops[2] = {
   1509       Ret,
   1510       Chain
   1511     };
   1512     return DAG.getMergeValues(Ops, DL);
   1513   }
   1514
   1516   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
   1517     SDValue MergedValues[2] = {
   1518       SplitVectorLoad(Op, DAG),
   1519       Chain
   1520     };
   1521     return DAG.getMergeValues(MergedValues, DL);
   1522   }
   1523 
   1524   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
   1525   if (ConstantBlock > -1 &&
   1526       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
   1527        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
   1528     SDValue Result;
   1529     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
   1530         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
   1531         isa<ConstantSDNode>(Ptr)) {
   1532       SDValue Slots[4];
   1533       for (unsigned i = 0; i < 4; i++) {
   1534         // We want the constant position encoded with the following formula:
   1535         //   (((512 + (kc_bank << 12) + const_index) << 2) + chan)
   1536         // where const_index is the Ptr computed by LLVM using an alignment
   1537         // of 16. Thus we add ((512 + (kc_bank << 12)) * 16 + chan * 4) here
   1538         // and then divide by 4 at the ISel step.
   1539         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
   1540             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
   1541         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
   1542       }
   1543       EVT NewVT = MVT::v4i32;
   1544       unsigned NumElements = 4;
   1545       if (VT.isVector()) {
   1546         NewVT = VT;
   1547         NumElements = VT.getVectorNumElements();
   1548       }
   1549       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
   1550                            makeArrayRef(Slots, NumElements));
   1551     } else {
   1552       // A non-constant ptr can't be folded; keep it as a v4f32 load.
   1553       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
   1554           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
   1555           DAG.getConstant(LoadNode->getAddressSpace() -
   1556                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
   1557           );
   1558     }
   1559 
   1560     if (!VT.isVector()) {
   1561       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
   1562           DAG.getConstant(0, MVT::i32));
   1563     }
   1564 
   1565     SDValue MergedValues[2] = {
   1566       Result,
   1567       Chain
   1568     };
   1569     return DAG.getMergeValues(MergedValues, DL);
   1570   }
   1571 
   1572   // For most operations returning SDValue() will result in the node being
   1573   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
   1574   // need to manually expand loads that may be legal in some address spaces and
   1575   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
   1576   // compute shaders, since the data is sign extended when it is uploaded to the
   1577   // buffer. However SEXT loads from other address spaces are not supported, so
   1578   // we need to expand them here.
   1579   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
   1580     EVT MemVT = LoadNode->getMemoryVT();
   1581     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
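            // Illustratively, an i8 sextload into an i32 result becomes an
            // any-extending i8 load followed by (x << 24) >> 24, where the
            // right shift is arithmetic so the sign bit is replicated.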
   1582     SDValue ShiftAmount =
   1583           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
   1584     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
   1585                                   LoadNode->getPointerInfo(), MemVT,
   1586                                   LoadNode->isVolatile(),
   1587                                   LoadNode->isNonTemporal(),
   1588                                   LoadNode->getAlignment());
   1589     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
   1590     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
   1591 
   1592     SDValue MergedValues[2] = { Sra, Chain };
   1593     return DAG.getMergeValues(MergedValues, DL);
   1594   }
   1595 
   1596   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1597     return SDValue();
   1598   }
   1599 
   1600   // Lowering for indirect addressing
   1601   const MachineFunction &MF = DAG.getMachineFunction();
   1602   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
   1603                                          getTargetMachine().getFrameLowering());
   1604   unsigned StackWidth = TFL->getStackWidth(MF);
   1605 
   1606   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1607 
   1608   if (VT.isVector()) {
   1609     unsigned NumElemVT = VT.getVectorNumElements();
   1610     EVT ElemVT = VT.getVectorElementType();
   1611     SDValue Loads[4];
   1612 
   1613     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
   1614                                       "vector width in load");
   1615 
   1616     for (unsigned i = 0; i < NumElemVT; ++i) {
   1617       unsigned Channel, PtrIncr;
   1618       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1619       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1620                         DAG.getConstant(PtrIncr, MVT::i32));
   1621       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
   1622                              Chain, Ptr,
   1623                              DAG.getTargetConstant(Channel, MVT::i32),
   1624                              Op.getOperand(2));
   1625     }
   1626     for (unsigned i = NumElemVT; i < 4; ++i) {
   1627       Loads[i] = DAG.getUNDEF(ElemVT);
   1628     }
   1629     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
   1630     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
   1631   } else {
   1632     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
   1633                               Chain, Ptr,
   1634                               DAG.getTargetConstant(0, MVT::i32), // Channel
   1635                               Op.getOperand(2));
   1636   }
   1637 
   1638   SDValue Ops[2] = {
   1639     LoweredLoad,
   1640     Chain
   1641   };
   1642 
   1643   return DAG.getMergeValues(Ops, DL);
   1644 }
   1645 
   1646 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   1647   SDValue Chain = Op.getOperand(0);
   1648   SDValue Cond  = Op.getOperand(1);
   1649   SDValue Jump  = Op.getOperand(2);
   1650 
   1651   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
   1652                      Chain, Jump, Cond);
   1653 }
   1654 
   1655 /// XXX Only kernel functions are supported, so we can assume for now that
   1656 /// every function is a kernel function, but in the future we should use
   1657 /// separate calling conventions for kernel and non-kernel functions.
   1658 SDValue R600TargetLowering::LowerFormalArguments(
   1659                                       SDValue Chain,
   1660                                       CallingConv::ID CallConv,
   1661                                       bool isVarArg,
   1662                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   1663                                       SDLoc DL, SelectionDAG &DAG,
   1664                                       SmallVectorImpl<SDValue> &InVals) const {
   1665   SmallVector<CCValAssign, 16> ArgLocs;
   1666   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
   1667                  getTargetMachine(), ArgLocs, *DAG.getContext());
   1668   MachineFunction &MF = DAG.getMachineFunction();
   1669   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
   1670 
   1671   SmallVector<ISD::InputArg, 8> LocalIns;
   1672 
   1673   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
   1674 
   1675   AnalyzeFormalArguments(CCInfo, LocalIns);
   1676 
   1677   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
   1678     CCValAssign &VA = ArgLocs[i];
   1679     EVT VT = Ins[i].VT;
   1680     EVT MemVT = LocalIns[i].VT;
   1681 
   1682     if (ShaderType != ShaderType::COMPUTE) {
   1683       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
   1684       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
   1685       InVals.push_back(Register);
   1686       continue;
   1687     }
   1688 
   1689     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
   1690                                                    AMDGPUAS::CONSTANT_BUFFER_0);
   1691 
   1692     // i64 isn't a legal type, so the register type used ends up as i32,
   1693     // which isn't expected here. The DAG then attempts to create a sextload
   1694     // that ends up being invalid. Somehow this seems to work for i64
   1695     // arguments, but breaks for <1 x i64>.
   1696 
   1697     // The first 36 bytes of the input buffer contains information about
   1698     // thread group and global sizes.
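            // So, illustratively, the first user argument (LocMemOffset 0) is
            // fetched from byte offset 36 of CONSTANT_BUFFER_0.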
   1699 
   1700     // FIXME: This should really check the extload type, but the handling of
   1701     // extload vector parameters seems to be broken.
   1702     //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
   1703     ISD::LoadExtType Ext = ISD::SEXTLOAD;
   1704     SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
   1705                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
   1706                                  MachinePointerInfo(UndefValue::get(PtrTy)),
   1707                                  MemVT, false, false, 4);
   1708 
   1709     // 4 is the preferred alignment for the CONSTANT memory space.
   1710     InVals.push_back(Arg);
   1711   }
   1712   return Chain;
   1713 }
   1714 
   1715 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   1716   if (!VT.isVector())
   1717     return MVT::i32;
   1718   return VT.changeVectorElementTypeToInteger();
   1719 }
   1720 
   1721 static SDValue CompactSwizzlableVector(
   1722   SelectionDAG &DAG, SDValue VectorEntry,
   1723   DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1724   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1725   assert(RemapSwizzle.empty());
   1726   SDValue NewBldVec[4] = {
   1727     VectorEntry.getOperand(0),
   1728     VectorEntry.getOperand(1),
   1729     VectorEntry.getOperand(2),
   1730     VectorEntry.getOperand(3)
   1731   };
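          // Illustratively, build_vector (x, x, 0.0f, 1.0f) compacts to the
          // swizzle selects (0, 0, SEL_0, SEL_1): the duplicate re-reads
          // element 0, and the constants use the free SEL_0 / SEL_1 selects.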
   1732 
   1733   for (unsigned i = 0; i < 4; i++) {
   1734     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
   1735       // We mask the write here to teach later passes that the ith element
   1736       // of this vector is undef. Thus we can use it to reduce 128-bit
   1737       // register usage, break false dependencies and additionally make
   1738       // assembly easier to read.
   1738       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
   1739     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
   1740       if (C->isZero()) {
   1741         RemapSwizzle[i] = 4; // SEL_0
   1742         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1743       } else if (C->isExactlyValue(1.0)) {
   1744         RemapSwizzle[i] = 5; // SEL_1
   1745         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1746       }
   1747     }
   1748 
   1749     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
   1750       continue;
   1751     for (unsigned j = 0; j < i; j++) {
   1752       if (NewBldVec[i] == NewBldVec[j]) {
   1753         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
   1754         RemapSwizzle[i] = j;
   1755         break;
   1756       }
   1757     }
   1758   }
   1759 
   1760   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1761                      VectorEntry.getValueType(), NewBldVec);
   1762 }
   1763 
   1764 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
   1765                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1766   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1767   assert(RemapSwizzle.empty());
   1768   SDValue NewBldVec[4] = {
   1769       VectorEntry.getOperand(0),
   1770       VectorEntry.getOperand(1),
   1771       VectorEntry.getOperand(2),
   1772       VectorEntry.getOperand(3)
   1773   };
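          // Sketch of the intent (inferred from the code below): elements that
          // are already extracted from their own source lane stay pinned, and
          // at most one other extract is then swapped into its source lane.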
   1774   bool isUnmovable[4] = { false, false, false, false };
   1775   for (unsigned i = 0; i < 4; i++) {
   1776     RemapSwizzle[i] = i;
   1777     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   1778       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
   1779           ->getZExtValue();
   1780       if (i == Idx)
   1781         isUnmovable[Idx] = true;
   1782     }
   1783   }
   1784 
   1785   for (unsigned i = 0; i < 4; i++) {
   1786     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   1787       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
   1788           ->getZExtValue();
   1789       if (isUnmovable[Idx])
   1790         continue;
   1791       // Swap i and Idx
   1792       std::swap(NewBldVec[Idx], NewBldVec[i]);
   1793       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
   1794       break;
   1795     }
   1796   }
   1797 
   1798   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1799                      VectorEntry.getValueType(), NewBldVec);
   1800 }
   1801 
   1803 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
   1804                                             SelectionDAG &DAG) const {
   1805   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   1806   // Old -> New swizzle values
   1807   DenseMap<unsigned, unsigned> SwizzleRemap;
   1808 
   1809   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
   1810   for (unsigned i = 0; i < 4; i++) {
   1811     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1812     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1813       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
   1814   }
   1815 
   1816   SwizzleRemap.clear();
   1817   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
   1818   for (unsigned i = 0; i < 4; i++) {
   1819     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1820     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1821       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
   1822   }
   1823 
   1824   return BuildVector;
   1825 }
   1826 
   1828 //===----------------------------------------------------------------------===//
   1829 // Custom DAG Optimizations
   1830 //===----------------------------------------------------------------------===//
   1831 
   1832 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   1833                                               DAGCombinerInfo &DCI) const {
   1834   SelectionDAG &DAG = DCI.DAG;
   1835 
   1836   switch (N->getOpcode()) {
   1837   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   1838   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
   1839   case ISD::FP_ROUND: {
   1840     SDValue Arg = N->getOperand(0);
   1841     if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
   1842       return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
   1843                          Arg.getOperand(0));
   1844     }
   1845     break;
   1846   }
   1847 
   1848   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
   1849   // (i32 select_cc f32, f32, -1, 0 cc)
   1850   //
   1851   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
   1852   // this to one of the SET*_DX10 instructions.
   1853   case ISD::FP_TO_SINT: {
   1854     SDValue FNeg = N->getOperand(0);
   1855     if (FNeg.getOpcode() != ISD::FNEG) {
   1856       return SDValue();
   1857     }
   1858     SDValue SelectCC = FNeg.getOperand(0);
   1859     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
   1860         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
   1861         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
   1862         !isHWTrueValue(SelectCC.getOperand(2)) ||
   1863         !isHWFalseValue(SelectCC.getOperand(3))) {
   1864       return SDValue();
   1865     }
   1866 
   1867     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
   1868                            SelectCC.getOperand(0), // LHS
   1869                            SelectCC.getOperand(1), // RHS
   1870                            DAG.getConstant(-1, MVT::i32), // True
   1871                            DAG.getConstant(0, MVT::i32),  // False
   1872                            SelectCC.getOperand(4)); // CC
   1875   }
   1876 
   1877   // insert_vector_elt (build_vector elt0, ..., eltN), NewElt, idx
   1878   // => build_vector elt0, ..., NewElt, ..., eltN
   1879   case ISD::INSERT_VECTOR_ELT: {
   1880     SDValue InVec = N->getOperand(0);
   1881     SDValue InVal = N->getOperand(1);
   1882     SDValue EltNo = N->getOperand(2);
   1883     SDLoc dl(N);
   1884 
   1885     // If the inserted element is an UNDEF, just use the input vector.
   1886     if (InVal.getOpcode() == ISD::UNDEF)
   1887       return InVec;
   1888 
   1889     EVT VT = InVec.getValueType();
   1890 
   1891     // If we can't generate a legal BUILD_VECTOR, exit
   1892     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
   1893       return SDValue();
   1894 
   1895     // Check that we know which element is being inserted
   1896     if (!isa<ConstantSDNode>(EltNo))
   1897       return SDValue();
   1898     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   1899 
   1900     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   1901     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   1902     // vector elements.
   1903     SmallVector<SDValue, 8> Ops;
   1904     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   1905       Ops.append(InVec.getNode()->op_begin(),
   1906                  InVec.getNode()->op_end());
   1907     } else if (InVec.getOpcode() == ISD::UNDEF) {
   1908       unsigned NElts = VT.getVectorNumElements();
   1909       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
   1910     } else {
   1911       return SDValue();
   1912     }
   1913 
   1914     // Insert the element
   1915     if (Elt < Ops.size()) {
   1916       // All the operands of BUILD_VECTOR must have the same type;
   1917       // we enforce that here.
   1918       EVT OpVT = Ops[0].getValueType();
   1919       if (InVal.getValueType() != OpVT)
   1920         InVal = OpVT.bitsGT(InVal.getValueType()) ?
   1921           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
   1922           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
   1923       Ops[Elt] = InVal;
   1924     }
   1925 
   1926     // Return the new vector
   1927     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   1928   }
   1929 
   1930   // An extract_vector_elt of a build_vector generated by custom lowering
   1931   // also needs to be combined here.
   1932   case ISD::EXTRACT_VECTOR_ELT: {
   1933     SDValue Arg = N->getOperand(0);
   1934     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
   1935       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1936         unsigned Element = Const->getZExtValue();
   1937         return Arg->getOperand(Element);
   1938       }
   1939     }
   1940     if (Arg.getOpcode() == ISD::BITCAST &&
   1941         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   1942       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1943         unsigned Element = Const->getZExtValue();
   1944         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
   1945             Arg->getOperand(0).getOperand(Element));
   1946       }
   1947     }
            break;
   1948   }
   1949 
   1950   case ISD::SELECT_CC: {
   1951     // Try common optimizations
   1952     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   1953     if (Ret.getNode())
   1954       return Ret;
   1955 
   1956     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
   1957     //      selectcc x, y, a, b, inv(cc)
   1958     //
   1959     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
   1960     //      selectcc x, y, a, b, cc
   1961     SDValue LHS = N->getOperand(0);
   1962     if (LHS.getOpcode() != ISD::SELECT_CC) {
   1963       return SDValue();
   1964     }
   1965 
   1966     SDValue RHS = N->getOperand(1);
   1967     SDValue True = N->getOperand(2);
   1968     SDValue False = N->getOperand(3);
   1969     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
   1970 
   1971     if (LHS.getOperand(2).getNode() != True.getNode() ||
   1972         LHS.getOperand(3).getNode() != False.getNode() ||
   1973         RHS.getNode() != False.getNode()) {
   1974       return SDValue();
   1975     }
   1976 
   1977     switch (NCC) {
   1978     default: return SDValue();
   1979     case ISD::SETNE: return LHS;
   1980     case ISD::SETEQ: {
   1981       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
   1982       LHSCC = ISD::getSetCCInverse(LHSCC,
   1983                                   LHS.getOperand(0).getValueType().isInteger());
   1984       if (DCI.isBeforeLegalizeOps() ||
   1985           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
   1986         return DAG.getSelectCC(SDLoc(N),
   1987                                LHS.getOperand(0),
   1988                                LHS.getOperand(1),
   1989                                LHS.getOperand(2),
   1990                                LHS.getOperand(3),
   1991                                LHSCC);
   1992       break;
   1993     }
   1994     }
   1995     return SDValue();
   1996   }
   1997 
   1998   case AMDGPUISD::EXPORT: {
   1999     SDValue Arg = N->getOperand(1);
   2000     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2001       break;
   2002 
   2003     SDValue NewArgs[8] = {
   2004       N->getOperand(0), // Chain
   2005       SDValue(),
   2006       N->getOperand(2), // ArrayBase
   2007       N->getOperand(3), // Type
   2008       N->getOperand(4), // SWZ_X
   2009       N->getOperand(5), // SWZ_Y
   2010       N->getOperand(6), // SWZ_Z
   2011       N->getOperand(7) // SWZ_W
   2012     };
   2013     SDLoc DL(N);
   2014     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
   2015     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
   2016   }
   2017   case AMDGPUISD::TEXTURE_FETCH: {
   2018     SDValue Arg = N->getOperand(1);
   2019     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2020       break;
   2021 
   2022     SDValue NewArgs[19] = {
   2023       N->getOperand(0),
   2024       N->getOperand(1),
   2025       N->getOperand(2),
   2026       N->getOperand(3),
   2027       N->getOperand(4),
   2028       N->getOperand(5),
   2029       N->getOperand(6),
   2030       N->getOperand(7),
   2031       N->getOperand(8),
   2032       N->getOperand(9),
   2033       N->getOperand(10),
   2034       N->getOperand(11),
   2035       N->getOperand(12),
   2036       N->getOperand(13),
   2037       N->getOperand(14),
   2038       N->getOperand(15),
   2039       N->getOperand(16),
   2040       N->getOperand(17),
   2041       N->getOperand(18),
   2042     };
   2043     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
   2044     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
   2045         NewArgs);
   2046   }
   2047   }
   2048 
   2049   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   2050 }
   2051 
   2052 static bool
   2053 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
   2054             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
   2055   const R600InstrInfo *TII =
   2056       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
   2057   if (!Src.isMachineOpcode())
   2058     return false;
   2059   switch (Src.getMachineOpcode()) {
   2060   case AMDGPU::FNEG_R600:
   2061     if (!Neg.getNode())
   2062       return false;
   2063     Src = Src.getOperand(0);
   2064     Neg = DAG.getTargetConstant(1, MVT::i32);
   2065     return true;
   2066   case AMDGPU::FABS_R600:
   2067     if (!Abs.getNode())
   2068       return false;
   2069     Src = Src.getOperand(0);
   2070     Abs = DAG.getTargetConstant(1, MVT::i32);
   2071     return true;
   2072   case AMDGPU::CONST_COPY: {
   2073     unsigned Opcode = ParentNode->getMachineOpcode();
   2074     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2075 
   2076     if (!Sel.getNode())
   2077       return false;
   2078 
   2079     SDValue CstOffset = Src.getOperand(0);
   2080     if (ParentNode->getValueType(0).isVector())
   2081       return false;
   2082 
   2083     // Gather constant values.
   2084     int SrcIndices[] = {
   2085       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2086       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2087       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
   2088       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2096     };
   2097     std::vector<unsigned> Consts;
   2098     for (int OtherSrcIdx : SrcIndices) {
   2099       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
   2100       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
   2101         continue;
   2102       if (HasDst) {
   2103         OtherSrcIdx--;
   2104         OtherSelIdx--;
   2105       }
   2106       if (RegisterSDNode *Reg =
   2107           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
   2108         if (Reg->getReg() == AMDGPU::ALU_CONST) {
   2109           ConstantSDNode *Cst
   2110             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
   2111           Consts.push_back(Cst->getZExtValue());
   2112         }
   2113       }
   2114     }
   2115 
   2116     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
   2117     Consts.push_back(Cst->getZExtValue());
   2118     if (!TII->fitsConstReadLimitations(Consts)) {
   2119       return false;
   2120     }
   2121 
   2122     Sel = CstOffset;
   2123     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
   2124     return true;
   2125   }
   2126   case AMDGPU::MOV_IMM_I32:
   2127   case AMDGPU::MOV_IMM_F32: {
   2128     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
   2129     uint64_t ImmValue = 0;
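            // R600 models a few common constants (0.0, 0.5, 1.0 and the
            // integers 0, 1) as dedicated registers; any other immediate must
            // go through the single ALU_LITERAL_X slot handled below.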
   2130
   2132     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
   2133       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
   2134       float FloatValue = FPC->getValueAPF().convertToFloat();
   2135       if (FloatValue == 0.0) {
   2136         ImmReg = AMDGPU::ZERO;
   2137       } else if (FloatValue == 0.5) {
   2138         ImmReg = AMDGPU::HALF;
   2139       } else if (FloatValue == 1.0) {
   2140         ImmReg = AMDGPU::ONE;
   2141       } else {
   2142         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
   2143       }
   2144     } else {
   2145       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
   2146       uint64_t Value = C->getZExtValue();
   2147       if (Value == 0) {
   2148         ImmReg = AMDGPU::ZERO;
   2149       } else if (Value == 1) {
   2150         ImmReg = AMDGPU::ONE_INT;
   2151       } else {
   2152         ImmValue = Value;
   2153       }
   2154     }
   2155 
   2156     // Check that we aren't already using an immediate.
   2157     // XXX: It's possible for an instruction to have more than one
   2158     // immediate operand, but this is not supported yet.
   2159     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
   2160       if (!Imm.getNode())
   2161         return false;
   2162       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
   2163       assert(C);
   2164       if (C->getZExtValue())
   2165         return false;
   2166       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
   2167     }
   2168     Src = DAG.getRegister(ImmReg, MVT::i32);
   2169     return true;
   2170   }
   2171   default:
   2172     return false;
   2173   }
   2174 }
   2175 
   2177 /// \brief Fold the instructions after selecting them
   2178 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
   2179                                             SelectionDAG &DAG) const {
   2180   const R600InstrInfo *TII =
   2181       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
   2182   if (!Node->isMachineOpcode())
   2183     return Node;
   2184   unsigned Opcode = Node->getMachineOpcode();
   2185   SDValue FakeOp;
   2186 
   2187   std::vector<SDValue> Ops;
   2188   for (const SDUse &I : Node->ops())
   2189     Ops.push_back(I);
   2190 
   2191   if (Opcode == AMDGPU::DOT_4) {
   2192     int OperandIdx[] = {
   2193       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2194       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2195       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2196       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2197       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2198       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2199       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2200       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2201     };
   2202     int NegIdx[] = {
   2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
   2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
   2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
   2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
   2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
   2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
   2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
   2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
   2211     };
   2212     int AbsIdx[] = {
   2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
   2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
   2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
   2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
   2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
   2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
   2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
   2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
   2221     };
   2222     for (unsigned i = 0; i < 8; i++) {
   2223       if (OperandIdx[i] < 0)
   2224         return Node;
   2225       SDValue &Src = Ops[OperandIdx[i] - 1];
   2226       SDValue &Neg = Ops[NegIdx[i] - 1];
   2227       SDValue &Abs = Ops[AbsIdx[i] - 1];
   2228       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2229       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2230       if (HasDst)
   2231         SelIdx--;
   2232       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2233       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
   2234         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2235     }
   2236   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
   2237     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
   2238       SDValue &Src = Ops[i];
   2239       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
   2240         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2241     }
   2242   } else if (Opcode == AMDGPU::CLAMP_R600) {
   2243     SDValue Src = Node->getOperand(0);
   2244     if (!Src.isMachineOpcode() ||
   2245         !TII->hasInstrModifiers(Src.getMachineOpcode()))
   2246       return Node;
   2247     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
   2248         AMDGPU::OpName::clamp);
   2249     if (ClampIdx < 0)
   2250       return Node;
   2251     std::vector<SDValue> Ops;
   2252     unsigned NumOp = Src.getNumOperands();
   2253     for (unsigned i = 0; i < NumOp; ++i)
   2254       Ops.push_back(Src.getOperand(i));
   2255     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
   2256     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
   2257         Node->getVTList(), Ops);
   2258   } else {
   2259     if (!TII->hasInstrModifiers(Opcode))
   2260       return Node;
   2261     int OperandIdx[] = {
   2262       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2264       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
   2265     };
   2266     int NegIdx[] = {
   2267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
   2268       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
   2269       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
   2270     };
   2271     int AbsIdx[] = {
   2272       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
   2273       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
   2274       -1
   2275     };
   2276     for (unsigned i = 0; i < 3; i++) {
   2277       if (OperandIdx[i] < 0)
   2278         return Node;
   2279       SDValue &Src = Ops[OperandIdx[i] - 1];
   2280       SDValue &Neg = Ops[NegIdx[i] - 1];
   2281       SDValue FakeAbs;
   2282       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
   2283       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2284       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2285       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
   2286       if (HasDst) {
   2287         SelIdx--;
   2288         ImmIdx--;
   2289       }
   2290       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2291       SDValue &Imm = Ops[ImmIdx];
   2292       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
   2293         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2294     }
   2295   }
   2296 
   2297   return Node;
   2298 }
   2299