      1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief Custom DAG lowering for R600
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "R600ISelLowering.h"
     16 #include "AMDGPUFrameLowering.h"
     17 #include "AMDGPUIntrinsicInfo.h"
     18 #include "AMDGPUSubtarget.h"
     19 #include "R600Defines.h"
     20 #include "R600InstrInfo.h"
     21 #include "R600MachineFunctionInfo.h"
     22 #include "llvm/Analysis/ValueTracking.h"
     23 #include "llvm/CodeGen/CallingConvLower.h"
     24 #include "llvm/CodeGen/MachineFrameInfo.h"
     25 #include "llvm/CodeGen/MachineInstrBuilder.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/IR/Argument.h"
     29 #include "llvm/IR/Function.h"
     30 
     31 using namespace llvm;
     32 
     33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
     34                                        const AMDGPUSubtarget &STI)
     35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
     36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
     37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
     38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
     39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
     40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
     41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
     42 
     43   computeRegisterProperties(STI.getRegisterInfo());
     44 
     45   // Set condition code actions
     46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
     47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
     48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
     49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
     50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
     51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
     52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
     53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
     54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
     55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
     56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
     57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
     58 
     59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
     60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
     61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
     62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
     63 
     64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
     65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
     66 
     67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
     68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
     69 
     70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
     71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
     73 
     74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
     75 
     76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
     77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
     78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
     79 
     80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
     81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
     82 
     83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
     84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
     85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
     86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     88 
     89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
     90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
     91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
     92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
     93 
     94   // ADD, SUB overflow.
     95   // TODO: turn these into Legal?
     96   if (Subtarget->hasCARRY())
     97     setOperationAction(ISD::UADDO, MVT::i32, Custom);
     98 
     99   if (Subtarget->hasBORROW())
    100     setOperationAction(ISD::USUBO, MVT::i32, Custom);
    101 
    102   // Expand sign extension of vectors
    103   if (!Subtarget->hasBFE())
    104     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
    105 
    106   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
    107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
    108 
    109   if (!Subtarget->hasBFE())
    110     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
    111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
    112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
    113 
    114   if (!Subtarget->hasBFE())
    115     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    116   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
    117   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
    118 
    119   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
    121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
    122 
    123   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
    124 
    125 
    126   // Legalize loads and stores to the private address space.
    127   setOperationAction(ISD::LOAD, MVT::i32, Custom);
    128   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
    129   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
    130 
    131   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
    132   // spaces, so it is custom lowered to handle those where it isn't.
    133   for (MVT VT : MVT::integer_valuetypes()) {
    134     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    135     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    136     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
    137 
    138     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    139     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
    141 
    142     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    143     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
    145   }
    146 
    147   setOperationAction(ISD::STORE, MVT::i8, Custom);
    148   setOperationAction(ISD::STORE, MVT::i32, Custom);
    149   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
    150   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
    151   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
    152   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
    153 
    154   setOperationAction(ISD::LOAD, MVT::i32, Custom);
    155   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
    156   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
    157 
    158   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
    159   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
    160   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    161   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    162 
    163   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
    164   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
    165   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    166   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    167 
    168   setTargetDAGCombine(ISD::FP_ROUND);
    169   setTargetDAGCombine(ISD::FP_TO_SINT);
    170   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    171   setTargetDAGCombine(ISD::SELECT_CC);
    172   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    173 
    174   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
    175   //  to be Legal/Custom in order to avoid library calls.
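           // The legalizer splits i64 shifts into these i32 *_PARTS nodes; they are
           // expanded below in LowerSHLParts/LowerSRXParts so no shift libcall is
           // emitted.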
    176   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    177   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
    178   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    179 
    180   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
    181 
    182   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
    183   for (MVT VT : ScalarIntVTs) {
    184     setOperationAction(ISD::ADDC, VT, Expand);
    185     setOperationAction(ISD::SUBC, VT, Expand);
    186     setOperationAction(ISD::ADDE, VT, Expand);
    187     setOperationAction(ISD::SUBE, VT, Expand);
    188   }
    189 
    190   setSchedulingPreference(Sched::Source);
    191 }
    192 
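         // Returns true if the instruction following \p I is an AMDGPU::RETURN, in
         // which case the export/RAT instruction being emitted should set its EOP
         // (end-of-program) bit.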
    193 static inline bool isEOP(MachineBasicBlock::iterator I) {
    194   return std::next(I)->getOpcode() == AMDGPU::RETURN;
    195 }
    196 
    197 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    198     MachineInstr * MI, MachineBasicBlock * BB) const {
    199   MachineFunction * MF = BB->getParent();
    200   MachineRegisterInfo &MRI = MF->getRegInfo();
    201   MachineBasicBlock::iterator I = *MI;
    202   const R600InstrInfo *TII =
    203       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
    204 
    205   switch (MI->getOpcode()) {
    206   default:
     207     // Replace LDS_*_RET instructions that don't have any uses with the
     208     // equivalent LDS_*_NORET instructions.
    209     if (TII->isLDSRetInstr(MI->getOpcode())) {
    210       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
    211       assert(DstIdx != -1);
    212       MachineInstrBuilder NewMI;
    213       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
    214       //        LDS_1A2D support and remove this special case.
    215       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
    216            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
    217         return BB;
    218 
    219       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
    220                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
    221       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
    222         NewMI.addOperand(MI->getOperand(i));
    223       }
    224     } else {
    225       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    226     }
    227     break;
    228   case AMDGPU::CLAMP_R600: {
    229     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
    230                                                    AMDGPU::MOV,
    231                                                    MI->getOperand(0).getReg(),
    232                                                    MI->getOperand(1).getReg());
    233     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    234     break;
    235   }
    236 
    237   case AMDGPU::FABS_R600: {
    238     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
    239                                                     AMDGPU::MOV,
    240                                                     MI->getOperand(0).getReg(),
    241                                                     MI->getOperand(1).getReg());
    242     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    243     break;
    244   }
    245 
    246   case AMDGPU::FNEG_R600: {
    247     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
    248                                                     AMDGPU::MOV,
    249                                                     MI->getOperand(0).getReg(),
    250                                                     MI->getOperand(1).getReg());
    251     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    252     break;
    253   }
    254 
    255   case AMDGPU::MASK_WRITE: {
    256     unsigned maskedRegister = MI->getOperand(0).getReg();
    257     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    258     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    259     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    260     break;
    261   }
    262 
    263   case AMDGPU::MOV_IMM_F32:
    264     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
    265                      MI->getOperand(1).getFPImm()->getValueAPF()
    266                          .bitcastToAPInt().getZExtValue());
    267     break;
    268   case AMDGPU::MOV_IMM_I32:
    269     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
    270                      MI->getOperand(1).getImm());
    271     break;
    272   case AMDGPU::CONST_COPY: {
    273     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
    274         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    275     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
    276         MI->getOperand(1).getImm());
    277     break;
    278   }
    279 
    280   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
    281   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
    282   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    283     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
    284             .addOperand(MI->getOperand(0))
    285             .addOperand(MI->getOperand(1))
    286             .addImm(isEOP(I)); // Set End of program bit
    287     break;
    288   }
    289   case AMDGPU::RAT_STORE_TYPED_eg: {
    290     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
    291             .addOperand(MI->getOperand(0))
    292             .addOperand(MI->getOperand(1))
    293             .addOperand(MI->getOperand(2))
    294             .addImm(isEOP(I)); // Set End of program bit
    295     break;
    296   }
    297 
    298   case AMDGPU::TXD: {
    299     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    300     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    301     MachineOperand &RID = MI->getOperand(4);
    302     MachineOperand &SID = MI->getOperand(5);
    303     unsigned TextureId = MI->getOperand(6).getImm();
    304     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    305     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
    306 
    307     switch (TextureId) {
    308     case 5: // Rect
    309       CTX = CTY = 0;
    310       break;
    311     case 6: // Shadow1D
    312       SrcW = SrcZ;
    313       break;
    314     case 7: // Shadow2D
    315       SrcW = SrcZ;
    316       break;
    317     case 8: // ShadowRect
    318       CTX = CTY = 0;
    319       SrcW = SrcZ;
    320       break;
    321     case 9: // 1DArray
    322       SrcZ = SrcY;
    323       CTZ = 0;
    324       break;
    325     case 10: // 2DArray
    326       CTZ = 0;
    327       break;
    328     case 11: // Shadow1DArray
    329       SrcZ = SrcY;
    330       CTZ = 0;
    331       break;
    332     case 12: // Shadow2DArray
    333       CTZ = 0;
    334       break;
    335     }
    336     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
    337             .addOperand(MI->getOperand(3))
    338             .addImm(SrcX)
    339             .addImm(SrcY)
    340             .addImm(SrcZ)
    341             .addImm(SrcW)
    342             .addImm(0)
    343             .addImm(0)
    344             .addImm(0)
    345             .addImm(0)
    346             .addImm(1)
    347             .addImm(2)
    348             .addImm(3)
    349             .addOperand(RID)
    350             .addOperand(SID)
    351             .addImm(CTX)
    352             .addImm(CTY)
    353             .addImm(CTZ)
    354             .addImm(CTW);
    355     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
    356             .addOperand(MI->getOperand(2))
    357             .addImm(SrcX)
    358             .addImm(SrcY)
    359             .addImm(SrcZ)
    360             .addImm(SrcW)
    361             .addImm(0)
    362             .addImm(0)
    363             .addImm(0)
    364             .addImm(0)
    365             .addImm(1)
    366             .addImm(2)
    367             .addImm(3)
    368             .addOperand(RID)
    369             .addOperand(SID)
    370             .addImm(CTX)
    371             .addImm(CTY)
    372             .addImm(CTZ)
    373             .addImm(CTW);
    374     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
    375             .addOperand(MI->getOperand(0))
    376             .addOperand(MI->getOperand(1))
    377             .addImm(SrcX)
    378             .addImm(SrcY)
    379             .addImm(SrcZ)
    380             .addImm(SrcW)
    381             .addImm(0)
    382             .addImm(0)
    383             .addImm(0)
    384             .addImm(0)
    385             .addImm(1)
    386             .addImm(2)
    387             .addImm(3)
    388             .addOperand(RID)
    389             .addOperand(SID)
    390             .addImm(CTX)
    391             .addImm(CTY)
    392             .addImm(CTZ)
    393             .addImm(CTW)
    394             .addReg(T0, RegState::Implicit)
    395             .addReg(T1, RegState::Implicit);
    396     break;
    397   }
    398 
    399   case AMDGPU::TXD_SHADOW: {
    400     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    401     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    402     MachineOperand &RID = MI->getOperand(4);
    403     MachineOperand &SID = MI->getOperand(5);
    404     unsigned TextureId = MI->getOperand(6).getImm();
    405     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    406     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
    407 
    408     switch (TextureId) {
    409     case 5: // Rect
    410       CTX = CTY = 0;
    411       break;
    412     case 6: // Shadow1D
    413       SrcW = SrcZ;
    414       break;
    415     case 7: // Shadow2D
    416       SrcW = SrcZ;
    417       break;
    418     case 8: // ShadowRect
    419       CTX = CTY = 0;
    420       SrcW = SrcZ;
    421       break;
    422     case 9: // 1DArray
    423       SrcZ = SrcY;
    424       CTZ = 0;
    425       break;
    426     case 10: // 2DArray
    427       CTZ = 0;
    428       break;
    429     case 11: // Shadow1DArray
    430       SrcZ = SrcY;
    431       CTZ = 0;
    432       break;
    433     case 12: // Shadow2DArray
    434       CTZ = 0;
    435       break;
    436     }
    437 
    438     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
    439             .addOperand(MI->getOperand(3))
    440             .addImm(SrcX)
    441             .addImm(SrcY)
    442             .addImm(SrcZ)
    443             .addImm(SrcW)
    444             .addImm(0)
    445             .addImm(0)
    446             .addImm(0)
    447             .addImm(0)
    448             .addImm(1)
    449             .addImm(2)
    450             .addImm(3)
    451             .addOperand(RID)
    452             .addOperand(SID)
    453             .addImm(CTX)
    454             .addImm(CTY)
    455             .addImm(CTZ)
    456             .addImm(CTW);
    457     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
    458             .addOperand(MI->getOperand(2))
    459             .addImm(SrcX)
    460             .addImm(SrcY)
    461             .addImm(SrcZ)
    462             .addImm(SrcW)
    463             .addImm(0)
    464             .addImm(0)
    465             .addImm(0)
    466             .addImm(0)
    467             .addImm(1)
    468             .addImm(2)
    469             .addImm(3)
    470             .addOperand(RID)
    471             .addOperand(SID)
    472             .addImm(CTX)
    473             .addImm(CTY)
    474             .addImm(CTZ)
    475             .addImm(CTW);
    476     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
    477             .addOperand(MI->getOperand(0))
    478             .addOperand(MI->getOperand(1))
    479             .addImm(SrcX)
    480             .addImm(SrcY)
    481             .addImm(SrcZ)
    482             .addImm(SrcW)
    483             .addImm(0)
    484             .addImm(0)
    485             .addImm(0)
    486             .addImm(0)
    487             .addImm(1)
    488             .addImm(2)
    489             .addImm(3)
    490             .addOperand(RID)
    491             .addOperand(SID)
    492             .addImm(CTX)
    493             .addImm(CTY)
    494             .addImm(CTZ)
    495             .addImm(CTW)
    496             .addReg(T0, RegState::Implicit)
    497             .addReg(T1, RegState::Implicit);
    498     break;
    499   }
    500 
    501   case AMDGPU::BRANCH:
    502       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
    503               .addOperand(MI->getOperand(0));
    504       break;
    505 
    506   case AMDGPU::BRANCH_COND_f32: {
    507     MachineInstr *NewMI =
    508       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
    509               AMDGPU::PREDICATE_BIT)
    510               .addOperand(MI->getOperand(1))
    511               .addImm(OPCODE_IS_NOT_ZERO)
    512               .addImm(0); // Flags
    513     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    514     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
    515             .addOperand(MI->getOperand(0))
    516             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    517     break;
    518   }
    519 
    520   case AMDGPU::BRANCH_COND_i32: {
    521     MachineInstr *NewMI =
    522       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
    523             AMDGPU::PREDICATE_BIT)
    524             .addOperand(MI->getOperand(1))
    525             .addImm(OPCODE_IS_NOT_ZERO_INT)
    526             .addImm(0); // Flags
    527     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    528     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
    529            .addOperand(MI->getOperand(0))
    530             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    531     break;
    532   }
    533 
    534   case AMDGPU::EG_ExportSwz:
    535   case AMDGPU::R600_ExportSwz: {
     536     // The instruction is left unmodified if it's not the last one of its type.
    537     bool isLastInstructionOfItsType = true;
    538     unsigned InstExportType = MI->getOperand(1).getImm();
    539     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
    540          EndBlock = BB->end(); NextExportInst != EndBlock;
    541          NextExportInst = std::next(NextExportInst)) {
    542       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
    543           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
    544         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
    545             .getImm();
    546         if (CurrentInstExportType == InstExportType) {
    547           isLastInstructionOfItsType = false;
    548           break;
    549         }
    550       }
    551     }
    552     bool EOP = isEOP(I);
    553     if (!EOP && !isLastInstructionOfItsType)
    554       return BB;
    555     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
    556     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
    557             .addOperand(MI->getOperand(0))
    558             .addOperand(MI->getOperand(1))
    559             .addOperand(MI->getOperand(2))
    560             .addOperand(MI->getOperand(3))
    561             .addOperand(MI->getOperand(4))
    562             .addOperand(MI->getOperand(5))
    563             .addOperand(MI->getOperand(6))
    564             .addImm(CfInst)
    565             .addImm(EOP);
    566     break;
    567   }
    568   case AMDGPU::RETURN: {
    569     // RETURN instructions must have the live-out registers as implicit uses,
    570     // otherwise they appear dead.
    571     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    572     MachineInstrBuilder MIB(*MF, MI);
    573     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
    574       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    575     return BB;
    576   }
    577   }
    578 
    579   MI->eraseFromParent();
    580   return BB;
    581 }
    582 
    583 //===----------------------------------------------------------------------===//
    584 // Custom DAG Lowering Operations
    585 //===----------------------------------------------------------------------===//
    586 
    587 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    588   MachineFunction &MF = DAG.getMachineFunction();
    589   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    590   switch (Op.getOpcode()) {
    591   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    592   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
    593   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
    594   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
    595   case ISD::SRA_PARTS:
    596   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
    597   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
    598   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
    599   case ISD::FCOS:
    600   case ISD::FSIN: return LowerTrig(Op, DAG);
    601   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
    602   case ISD::STORE: return LowerSTORE(Op, DAG);
    603   case ISD::LOAD: {
    604     SDValue Result = LowerLOAD(Op, DAG);
    605     assert((!Result.getNode() ||
    606             Result.getNode()->getNumValues() == 2) &&
    607            "Load should return a value and a chain");
    608     return Result;
    609   }
    610 
    611   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    612   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
    613   case ISD::INTRINSIC_VOID: {
    614     SDValue Chain = Op.getOperand(0);
    615     unsigned IntrinsicID =
    616                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    617     switch (IntrinsicID) {
    618     case AMDGPUIntrinsic::AMDGPU_store_output: {
    619       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
    620       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
    621       MFI->LiveOuts.push_back(Reg);
    622       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    623     }
    624     case AMDGPUIntrinsic::R600_store_swizzle: {
    625       SDLoc DL(Op);
    626       const SDValue Args[8] = {
    627         Chain,
    628         Op.getOperand(2), // Export Value
    629         Op.getOperand(3), // ArrayBase
    630         Op.getOperand(4), // Type
    631         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
    632         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
    633         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
    634         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
    635       };
    636       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
    637     }
    638 
    639     // default for switch(IntrinsicID)
    640     default: break;
    641     }
    642     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    643     break;
    644   }
    645   case ISD::INTRINSIC_WO_CHAIN: {
    646     unsigned IntrinsicID =
    647                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    648     EVT VT = Op.getValueType();
    649     SDLoc DL(Op);
    650     switch(IntrinsicID) {
    651     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    652     case AMDGPUIntrinsic::R600_load_input: {
    653       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    654       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
    655       MachineFunction &MF = DAG.getMachineFunction();
    656       MachineRegisterInfo &MRI = MF.getRegInfo();
    657       MRI.addLiveIn(Reg);
    658       return DAG.getCopyFromReg(DAG.getEntryNode(),
    659           SDLoc(DAG.getEntryNode()), Reg, VT);
    660     }
    661 
    662     case AMDGPUIntrinsic::R600_interp_input: {
    663       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    664       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
    665       MachineSDNode *interp;
    666       if (ijb < 0) {
    667         const R600InstrInfo *TII =
    668             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
    669         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
    670             MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
    671         return DAG.getTargetExtractSubreg(
    672             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
    673             DL, MVT::f32, SDValue(interp, 0));
    674       }
    675       MachineFunction &MF = DAG.getMachineFunction();
    676       MachineRegisterInfo &MRI = MF.getRegInfo();
    677       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
    678       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
    679       MRI.addLiveIn(RegisterI);
    680       MRI.addLiveIn(RegisterJ);
    681       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
    682           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
    683       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
    684           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
    685 
    686       if (slot % 4 < 2)
    687         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
    688             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
    689             RegisterJNode, RegisterINode);
    690       else
    691         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
    692             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
    693             RegisterJNode, RegisterINode);
    694       return SDValue(interp, slot % 2);
    695     }
    696     case AMDGPUIntrinsic::R600_interp_xy:
    697     case AMDGPUIntrinsic::R600_interp_zw: {
    698       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    699       MachineSDNode *interp;
    700       SDValue RegisterINode = Op.getOperand(2);
    701       SDValue RegisterJNode = Op.getOperand(3);
    702 
    703       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
    704         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
    705             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
    706             RegisterJNode, RegisterINode);
    707       else
    708         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
    709             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
    710             RegisterJNode, RegisterINode);
    711       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
    712           SDValue(interp, 0), SDValue(interp, 1));
    713     }
    714     case AMDGPUIntrinsic::R600_tex:
    715     case AMDGPUIntrinsic::R600_texc:
    716     case AMDGPUIntrinsic::R600_txl:
    717     case AMDGPUIntrinsic::R600_txlc:
    718     case AMDGPUIntrinsic::R600_txb:
    719     case AMDGPUIntrinsic::R600_txbc:
    720     case AMDGPUIntrinsic::R600_txf:
    721     case AMDGPUIntrinsic::R600_txq:
    722     case AMDGPUIntrinsic::R600_ddx:
    723     case AMDGPUIntrinsic::R600_ddy:
    724     case AMDGPUIntrinsic::R600_ldptr: {
    725       unsigned TextureOp;
    726       switch (IntrinsicID) {
    727       case AMDGPUIntrinsic::R600_tex:
    728         TextureOp = 0;
    729         break;
    730       case AMDGPUIntrinsic::R600_texc:
    731         TextureOp = 1;
    732         break;
    733       case AMDGPUIntrinsic::R600_txl:
    734         TextureOp = 2;
    735         break;
    736       case AMDGPUIntrinsic::R600_txlc:
    737         TextureOp = 3;
    738         break;
    739       case AMDGPUIntrinsic::R600_txb:
    740         TextureOp = 4;
    741         break;
    742       case AMDGPUIntrinsic::R600_txbc:
    743         TextureOp = 5;
    744         break;
    745       case AMDGPUIntrinsic::R600_txf:
    746         TextureOp = 6;
    747         break;
    748       case AMDGPUIntrinsic::R600_txq:
    749         TextureOp = 7;
    750         break;
    751       case AMDGPUIntrinsic::R600_ddx:
    752         TextureOp = 8;
    753         break;
    754       case AMDGPUIntrinsic::R600_ddy:
    755         TextureOp = 9;
    756         break;
    757       case AMDGPUIntrinsic::R600_ldptr:
    758         TextureOp = 10;
    759         break;
    760       default:
     761         llvm_unreachable("Unknown Texture Operation");
    762       }
    763 
    764       SDValue TexArgs[19] = {
    765         DAG.getConstant(TextureOp, DL, MVT::i32),
    766         Op.getOperand(1),
    767         DAG.getConstant(0, DL, MVT::i32),
    768         DAG.getConstant(1, DL, MVT::i32),
    769         DAG.getConstant(2, DL, MVT::i32),
    770         DAG.getConstant(3, DL, MVT::i32),
    771         Op.getOperand(2),
    772         Op.getOperand(3),
    773         Op.getOperand(4),
    774         DAG.getConstant(0, DL, MVT::i32),
    775         DAG.getConstant(1, DL, MVT::i32),
    776         DAG.getConstant(2, DL, MVT::i32),
    777         DAG.getConstant(3, DL, MVT::i32),
    778         Op.getOperand(5),
    779         Op.getOperand(6),
    780         Op.getOperand(7),
    781         Op.getOperand(8),
    782         Op.getOperand(9),
    783         Op.getOperand(10)
    784       };
    785       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    786     }
    787     case AMDGPUIntrinsic::AMDGPU_dp4: {
    788       SDValue Args[8] = {
    789       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    790           DAG.getConstant(0, DL, MVT::i32)),
    791       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    792           DAG.getConstant(0, DL, MVT::i32)),
    793       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    794           DAG.getConstant(1, DL, MVT::i32)),
    795       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    796           DAG.getConstant(1, DL, MVT::i32)),
    797       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    798           DAG.getConstant(2, DL, MVT::i32)),
    799       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    800           DAG.getConstant(2, DL, MVT::i32)),
    801       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    802           DAG.getConstant(3, DL, MVT::i32)),
    803       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    804           DAG.getConstant(3, DL, MVT::i32))
    805       };
    806       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    807     }
    808 
    809     case Intrinsic::r600_read_ngroups_x:
    810       return LowerImplicitParameter(DAG, VT, DL, 0);
    811     case Intrinsic::r600_read_ngroups_y:
    812       return LowerImplicitParameter(DAG, VT, DL, 1);
    813     case Intrinsic::r600_read_ngroups_z:
    814       return LowerImplicitParameter(DAG, VT, DL, 2);
    815     case Intrinsic::r600_read_global_size_x:
    816       return LowerImplicitParameter(DAG, VT, DL, 3);
    817     case Intrinsic::r600_read_global_size_y:
    818       return LowerImplicitParameter(DAG, VT, DL, 4);
    819     case Intrinsic::r600_read_global_size_z:
    820       return LowerImplicitParameter(DAG, VT, DL, 5);
    821     case Intrinsic::r600_read_local_size_x:
    822       return LowerImplicitParameter(DAG, VT, DL, 6);
    823     case Intrinsic::r600_read_local_size_y:
    824       return LowerImplicitParameter(DAG, VT, DL, 7);
    825     case Intrinsic::r600_read_local_size_z:
    826       return LowerImplicitParameter(DAG, VT, DL, 8);
    827 
    828     case Intrinsic::AMDGPU_read_workdim: {
    829       uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
    830       return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
    831     }
    832 
    833     case Intrinsic::r600_read_tgid_x:
    834       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    835                                   AMDGPU::T1_X, VT);
    836     case Intrinsic::r600_read_tgid_y:
    837       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    838                                   AMDGPU::T1_Y, VT);
    839     case Intrinsic::r600_read_tgid_z:
    840       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    841                                   AMDGPU::T1_Z, VT);
    842     case Intrinsic::r600_read_tidig_x:
    843       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    844                                   AMDGPU::T0_X, VT);
    845     case Intrinsic::r600_read_tidig_y:
    846       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    847                                   AMDGPU::T0_Y, VT);
    848     case Intrinsic::r600_read_tidig_z:
    849       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    850                                   AMDGPU::T0_Z, VT);
    851     case Intrinsic::AMDGPU_rsq:
    852       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
    853       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    854 
    855     case AMDGPUIntrinsic::AMDGPU_fract:
    856     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
    857       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
    858     }
    859     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    860     break;
    861   }
    862   } // end switch(Op.getOpcode())
    863   return SDValue();
    864 }
    865 
    866 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
    867                                             SmallVectorImpl<SDValue> &Results,
    868                                             SelectionDAG &DAG) const {
    869   switch (N->getOpcode()) {
    870   default:
    871     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    872     return;
    873   case ISD::FP_TO_UINT:
    874     if (N->getValueType(0) == MVT::i1) {
    875       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    876       return;
    877     }
    878     // Fall-through. Since we don't care about out of bounds values
    879     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    880     // considers some extra cases which are not necessary here.
    881   case ISD::FP_TO_SINT: {
    882     SDValue Result;
    883     if (expandFP_TO_SINT(N, Result, DAG))
    884       Results.push_back(Result);
    885     return;
    886   }
    887   case ISD::SDIVREM: {
    888     SDValue Op = SDValue(N, 1);
    889     SDValue RES = LowerSDIVREM(Op, DAG);
    890     Results.push_back(RES);
    891     Results.push_back(RES.getValue(1));
    892     break;
    893   }
    894   case ISD::UDIVREM: {
    895     SDValue Op = SDValue(N, 0);
    896     LowerUDIVREM64(Op, DAG, Results);
    897     break;
    898   }
    899   }
    900 }
    901 
    902 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
    903                                                    SDValue Vector) const {
    904 
    905   SDLoc DL(Vector);
    906   EVT VecVT = Vector.getValueType();
    907   EVT EltVT = VecVT.getVectorElementType();
    908   SmallVector<SDValue, 8> Args;
    909 
    910   for (unsigned i = 0, e = VecVT.getVectorNumElements();
    911                                                            i != e; ++i) {
    912     Args.push_back(DAG.getNode(
    913         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
    914         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
    915   }
    916 
    917   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
    918 }
    919 
    920 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
    921                                                     SelectionDAG &DAG) const {
    922 
    923   SDLoc DL(Op);
    924   SDValue Vector = Op.getOperand(0);
    925   SDValue Index = Op.getOperand(1);
    926 
    927   if (isa<ConstantSDNode>(Index) ||
    928       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    929     return Op;
    930 
    931   Vector = vectorToVerticalVector(DAG, Vector);
    932   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
    933                      Vector, Index);
    934 }
    935 
    936 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
    937                                                    SelectionDAG &DAG) const {
    938   SDLoc DL(Op);
    939   SDValue Vector = Op.getOperand(0);
    940   SDValue Value = Op.getOperand(1);
    941   SDValue Index = Op.getOperand(2);
    942 
    943   if (isa<ConstantSDNode>(Index) ||
    944       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    945     return Op;
    946 
    947   Vector = vectorToVerticalVector(DAG, Vector);
    948   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
    949                                Vector, Value, Index);
    950   return vectorToVerticalVector(DAG, Insert);
    951 }
    952 
    953 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
     954   // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
     955   // Thus we lower them to TRIG(FRACT(x / (2*Pi) + 0.5) - 0.5).
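           // 0.15915494309 is 1/(2*Pi): the argument is converted to revolutions,
           // FRACT keeps only the fractional revolution (in [0, 1)), and the +/-0.5
           // terms recenter it on [-0.5, 0.5), within the range noted above.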
    956   EVT VT = Op.getValueType();
    957   SDValue Arg = Op.getOperand(0);
    958   SDLoc DL(Op);
    959 
    960   // TODO: Should this propagate fast-math-flags?
    961   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
    962       DAG.getNode(ISD::FADD, DL, VT,
    963         DAG.getNode(ISD::FMUL, DL, VT, Arg,
    964           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
    965         DAG.getConstantFP(0.5, DL, MVT::f32)));
    966   unsigned TrigNode;
    967   switch (Op.getOpcode()) {
    968   case ISD::FCOS:
    969     TrigNode = AMDGPUISD::COS_HW;
    970     break;
    971   case ISD::FSIN:
    972     TrigNode = AMDGPUISD::SIN_HW;
    973     break;
    974   default:
    975     llvm_unreachable("Wrong trig opcode");
    976   }
    977   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
    978       DAG.getNode(ISD::FADD, DL, VT, FractPart,
    979         DAG.getConstantFP(-0.5, DL, MVT::f32)));
    980   if (Gen >= AMDGPUSubtarget::R700)
    981     return TrigVal;
    982   // On R600 hw, COS/SIN input must be between -Pi and Pi.
    983   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
    984       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
    985 }
    986 
    987 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
    988   SDLoc DL(Op);
    989   EVT VT = Op.getValueType();
    990 
    991   SDValue Lo = Op.getOperand(0);
    992   SDValue Hi = Op.getOperand(1);
    993   SDValue Shift = Op.getOperand(2);
    994   SDValue Zero = DAG.getConstant(0, DL, VT);
    995   SDValue One  = DAG.getConstant(1, DL, VT);
    996 
    997   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
    998   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
    999   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
   1000   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
   1001 
    1002   // The dance around Width1 is necessary for the Shift == 0 special case.
    1003   // Without it, CompShift could be 32, producing an incorrect result in
    1004   // Overflow. So we do the shift in two steps; the alternative would be to
    1005   // add a conditional to filter out the special case.
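           // For example, with Shift == 0: CompShift == 31, so the two SRLs compute
           // (Lo >> 31) >> 1 == 0, whereas a single SRL by CompShift + 1 == 32 would
           // be undefined.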
   1006 
   1007   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
   1008   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
   1009 
   1010   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
   1011   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
   1012   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
   1013 
   1014   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
   1015   SDValue LoBig = Zero;
   1016 
   1017   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
   1018   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
   1019 
   1020   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
   1021 }
   1022 
   1023 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
   1024   SDLoc DL(Op);
   1025   EVT VT = Op.getValueType();
   1026 
   1027   SDValue Lo = Op.getOperand(0);
   1028   SDValue Hi = Op.getOperand(1);
   1029   SDValue Shift = Op.getOperand(2);
   1030   SDValue Zero = DAG.getConstant(0, DL, VT);
   1031   SDValue One  = DAG.getConstant(1, DL, VT);
   1032 
   1033   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
   1034 
   1035   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
   1036   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
   1037   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
   1038   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
   1039 
    1040   // The dance around Width1 is necessary for the Shift == 0 special case.
    1041   // Without it, CompShift could be 32, producing an incorrect result in
    1042   // Overflow. So we do the shift in two steps; the alternative would be to
    1043   // add a conditional to filter out the special case.
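           // For SRA_PARTS the big-shift case fills the high result with sign bits
           // (an arithmetic shift of Hi by Width1) rather than zero.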
   1044 
   1045   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
   1046   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
   1047 
   1048   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
   1049   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
   1050   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
   1051 
   1052   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
   1053   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
   1054 
   1055   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
   1056   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
   1057 
   1058   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
   1059 }
   1060 
   1061 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
   1062                                           unsigned mainop, unsigned ovf) const {
   1063   SDLoc DL(Op);
   1064   EVT VT = Op.getValueType();
   1065 
   1066   SDValue Lo = Op.getOperand(0);
   1067   SDValue Hi = Op.getOperand(1);
   1068 
   1069   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
   1070   // Extend sign.
   1071   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
   1072                     DAG.getValueType(MVT::i1));
   1073 
   1074   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
   1075 
   1076   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
   1077 }
   1078 
   1079 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
   1080   SDLoc DL(Op);
   1081   return DAG.getNode(
   1082       ISD::SETCC,
   1083       DL,
   1084       MVT::i1,
   1085       Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
   1086       DAG.getCondCode(ISD::SETNE)
   1087       );
   1088 }
   1089 
   1090 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
   1091                                                    SDLoc DL,
   1092                                                    unsigned DwordOffset) const {
   1093   unsigned ByteOffset = DwordOffset * 4;
   1094   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
   1095                                       AMDGPUAS::CONSTANT_BUFFER_0);
   1096 
    1097   // We shouldn't be using an offset wider than 16 bits for implicit parameters.
   1098   assert(isInt<16>(ByteOffset));
   1099 
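           // Implicit kernel parameters (ngroups, global/local work sizes, etc.) are
           // read from fixed dword offsets in CONSTANT_BUFFER_0, so a plain
           // constant-address load suffices.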
   1100   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
   1101                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
   1102                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
   1103                      false, false, false, 0);
   1104 }
   1105 
   1106 bool R600TargetLowering::isZero(SDValue Op) const {
   1107   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
   1108     return Cst->isNullValue();
   1109   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
   1110     return CstFP->isZero();
   1111   } else {
   1112     return false;
   1113   }
   1114 }
   1115 
   1116 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   1117   SDLoc DL(Op);
   1118   EVT VT = Op.getValueType();
   1119 
   1120   SDValue LHS = Op.getOperand(0);
   1121   SDValue RHS = Op.getOperand(1);
   1122   SDValue True = Op.getOperand(2);
   1123   SDValue False = Op.getOperand(3);
   1124   SDValue CC = Op.getOperand(4);
   1125   SDValue Temp;
   1126 
   1127   if (VT == MVT::f32) {
   1128     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
   1129     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
   1130     if (MinMax)
   1131       return MinMax;
   1132   }
   1133 
   1134   // LHS and RHS are guaranteed to be the same value type
   1135   EVT CompareVT = LHS.getValueType();
   1136 
   1137   // Check if we can lower this to a native operation.
   1138 
   1139   // Try to lower to a SET* instruction:
   1140   //
   1141   // SET* can match the following patterns:
   1142   //
   1143   // select_cc f32, f32, -1,  0, cc_supported
   1144   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
   1145   // select_cc i32, i32, -1,  0, cc_supported
   1146   //
   1147 
   1148   // Move hardware True/False values to the correct operand.
   1149   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1150   ISD::CondCode InverseCC =
   1151      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1152   if (isHWTrueValue(False) && isHWFalseValue(True)) {
   1153     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
   1154       std::swap(False, True);
   1155       CC = DAG.getCondCode(InverseCC);
   1156     } else {
   1157       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
   1158       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
   1159         std::swap(False, True);
   1160         std::swap(LHS, RHS);
   1161         CC = DAG.getCondCode(SwapInvCC);
   1162       }
   1163     }
   1164   }
   1165 
   1166   if (isHWTrueValue(True) && isHWFalseValue(False) &&
   1167       (CompareVT == VT || VT == MVT::i32)) {
   1168     // This can be matched by a SET* instruction.
   1169     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
   1170   }
   1171 
   1172   // Try to lower to a CND* instruction:
   1173   //
   1174   // CND* can match the following patterns:
   1175   //
   1176   // select_cc f32, 0.0, f32, f32, cc_supported
   1177   // select_cc f32, 0.0, i32, i32, cc_supported
   1178   // select_cc i32, 0,   f32, f32, cc_supported
   1179   // select_cc i32, 0,   i32, i32, cc_supported
   1180   //
   1181 
   1182   // Try to move the zero value to the RHS
   1183   if (isZero(LHS)) {
   1184     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1185     // Try swapping the operands
   1186     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
   1187     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1188       std::swap(LHS, RHS);
   1189       CC = DAG.getCondCode(CCSwapped);
   1190     } else {
    1191       // Try inverting the condition and then swapping the operands.
   1192       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
   1193       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
   1194       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1195         std::swap(True, False);
   1196         std::swap(LHS, RHS);
   1197         CC = DAG.getCondCode(CCSwapped);
   1198       }
   1199     }
   1200   }
   1201   if (isZero(RHS)) {
   1202     SDValue Cond = LHS;
   1203     SDValue Zero = RHS;
   1204     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1205     if (CompareVT != VT) {
    1206       // Bitcast True / False to the correct types. This will end up being
    1207       // a nop, but it allows us to define only a single pattern in the
    1208       // .TD files for each CND* instruction rather than one pattern for
    1209       // integer True/False and one for fp True/False.
   1210       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
   1211       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
   1212     }
   1213 
   1214     switch (CCOpcode) {
   1215     case ISD::SETONE:
   1216     case ISD::SETUNE:
   1217     case ISD::SETNE:
   1218       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1219       Temp = True;
   1220       True = False;
   1221       False = Temp;
   1222       break;
   1223     default:
   1224       break;
   1225     }
   1226     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
   1227         Cond, Zero,
   1228         True, False,
   1229         DAG.getCondCode(CCOpcode));
   1230     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
   1231   }
   1232 
    1233   // If we make it this far, it means we have no native instructions to handle
    1234   // this SELECT_CC, so we must lower it.
   1235   SDValue HWTrue, HWFalse;
   1236 
   1237   if (CompareVT == MVT::f32) {
   1238     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
   1239     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
   1240   } else if (CompareVT == MVT::i32) {
   1241     HWTrue = DAG.getConstant(-1, DL, CompareVT);
   1242     HWFalse = DAG.getConstant(0, DL, CompareVT);
   1243   }
   1244   else {
   1245     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
   1246   }
   1247 
   1248   // Lower this unsupported SELECT_CC into a combination of two supported
   1249   // SELECT_CC operations.
   1250   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
   1251 
   1252   return DAG.getNode(ISD::SELECT_CC, DL, VT,
   1253       Cond, HWFalse,
   1254       True, False,
   1255       DAG.getCondCode(ISD::SETNE));
   1256 }
   1257 
    1258 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to
    1259 /// convert these pointers to a register index. Each register holds
    1260 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
    1261 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
    1262 /// for indirect addressing.
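         /// For example, with \p StackWidth == 1 only one 32-bit channel per index
         /// is used, so the byte address is divided by 4 (SRL by 2); with
         /// \p StackWidth == 4 all four channels are used and the divisor is 16
         /// (SRL by 4).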
   1263 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
   1264                                                unsigned StackWidth,
   1265                                                SelectionDAG &DAG) const {
   1266   unsigned SRLPad;
   1267   switch(StackWidth) {
   1268   case 1:
   1269     SRLPad = 2;
   1270     break;
   1271   case 2:
   1272     SRLPad = 3;
   1273     break;
   1274   case 4:
   1275     SRLPad = 4;
   1276     break;
   1277   default: llvm_unreachable("Invalid stack width");
   1278   }
   1279 
   1280   SDLoc DL(Ptr);
   1281   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
   1282                      DAG.getConstant(SRLPad, DL, MVT::i32));
   1283 }
   1284 
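         /// Map element \p ElemIdx of a value spilled to the stack onto a register
         /// channel (\p Channel) and a pointer increment (\p PtrIncr), given how
         /// many channels per register (\p StackWidth) the stack uses.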
   1285 void R600TargetLowering::getStackAddress(unsigned StackWidth,
   1286                                          unsigned ElemIdx,
   1287                                          unsigned &Channel,
   1288                                          unsigned &PtrIncr) const {
   1289   switch (StackWidth) {
   1290   default:
   1291   case 1:
   1292     Channel = 0;
   1293     if (ElemIdx > 0) {
   1294       PtrIncr = 1;
   1295     } else {
   1296       PtrIncr = 0;
   1297     }
   1298     break;
   1299   case 2:
   1300     Channel = ElemIdx % 2;
   1301     if (ElemIdx == 2) {
   1302       PtrIncr = 1;
   1303     } else {
   1304       PtrIncr = 0;
   1305     }
   1306     break;
   1307   case 4:
   1308     Channel = ElemIdx;
   1309     PtrIncr = 0;
   1310     break;
   1311   }
   1312 }
   1313 
   1314 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   1315   SDLoc DL(Op);
   1316   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
   1317   SDValue Chain = Op.getOperand(0);
   1318   SDValue Value = Op.getOperand(1);
   1319   SDValue Ptr = Op.getOperand(2);
   1320 
   1321   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   1322   if (Result.getNode()) {
   1323     return Result;
   1324   }
   1325 
   1326   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
   1327     if (StoreNode->isTruncatingStore()) {
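               // i8/i16 global stores are emulated with a masked dword store
               // (STORE_MSKOR): the truncated value and a byte/short mask are
               // shifted into position within the containing dword. For example,
               // an i8 store to byte address 6 gives DWordAddr = 1, ByteIndex = 2
               // and Shift = 16, so both the value and the 0xFF mask are shifted
               // left by 16.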
   1328       EVT VT = Value.getValueType();
   1329       assert(VT.bitsLE(MVT::i32));
   1330       EVT MemVT = StoreNode->getMemoryVT();
   1331       SDValue MaskConstant;
   1332       if (MemVT == MVT::i8) {
   1333         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
   1334       } else {
   1335         assert(MemVT == MVT::i16);
   1336         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
   1337       }
   1338       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
   1339                                       DAG.getConstant(2, DL, MVT::i32));
   1340       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
   1341                                       DAG.getConstant(0x00000003, DL, VT));
   1342       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
   1343       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
   1344                                    DAG.getConstant(3, DL, VT));
   1345       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
   1346       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
   1347       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
   1348       // vector instead.
   1349       SDValue Src[4] = {
   1350         ShiftedValue,
   1351         DAG.getConstant(0, DL, MVT::i32),
   1352         DAG.getConstant(0, DL, MVT::i32),
   1353         Mask
   1354       };
   1355       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
   1356       SDValue Args[3] = { Chain, Input, DWordAddr };
   1357       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
   1358                                      Op->getVTList(), Args, MemVT,
   1359                                      StoreNode->getMemOperand());
   1360     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
   1361                Value.getValueType().bitsGE(MVT::i32)) {
   1362       // Convert pointer from byte address to dword address.
   1363       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
   1364                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
   1365                                     Ptr, DAG.getConstant(2, DL, MVT::i32)));
   1366 
   1367       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
   1368         llvm_unreachable("Truncated and indexed stores not supported yet");
   1369       } else {
   1370         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
   1371       }
   1372       return Chain;
   1373     }
   1374   }
   1375 
   1376   EVT ValueVT = Value.getValueType();
   1377 
   1378   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1379     return SDValue();
   1380   }
   1381 
   1382   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   1383   if (Ret.getNode()) {
   1384     return Ret;
   1385   }
   1386   // Lowering for indirect addressing
   1387 
   1388   const MachineFunction &MF = DAG.getMachineFunction();
   1389   const AMDGPUFrameLowering *TFL =
   1390       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
   1391   unsigned StackWidth = TFL->getStackWidth(MF);
   1392 
   1393   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1394 
   1395   if (ValueVT.isVector()) {
   1396     unsigned NumElemVT = ValueVT.getVectorNumElements();
   1397     EVT ElemVT = ValueVT.getVectorElementType();
   1398     SmallVector<SDValue, 4> Stores(NumElemVT);
   1399 
   1400     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
    1401                                       "vector width in store");
   1402 
   1403     for (unsigned i = 0; i < NumElemVT; ++i) {
   1404       unsigned Channel, PtrIncr;
   1405       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1406       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1407                         DAG.getConstant(PtrIncr, DL, MVT::i32));
   1408       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
   1409                                  Value, DAG.getConstant(i, DL, MVT::i32));
   1410 
   1411       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
   1412                               Chain, Elem, Ptr,
   1413                               DAG.getTargetConstant(Channel, DL, MVT::i32));
   1414     }
    1415     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
    1416   } else {
   1417     if (ValueVT == MVT::i8) {
   1418       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
   1419     }
    1420     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
    1421                         DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
   1422   }
   1423 
   1424   return Chain;
   1425 }
   1426 
    1427 // Return 512 + (kc_bank << 12), or -1 for non-constant-buffer address spaces.
   1428 static int
   1429 ConstantAddressBlock(unsigned AddressSpace) {
   1430   switch (AddressSpace) {
   1431   case AMDGPUAS::CONSTANT_BUFFER_0:
   1432     return 512;
   1433   case AMDGPUAS::CONSTANT_BUFFER_1:
   1434     return 512 + 4096;
   1435   case AMDGPUAS::CONSTANT_BUFFER_2:
   1436     return 512 + 4096 * 2;
   1437   case AMDGPUAS::CONSTANT_BUFFER_3:
   1438     return 512 + 4096 * 3;
   1439   case AMDGPUAS::CONSTANT_BUFFER_4:
   1440     return 512 + 4096 * 4;
   1441   case AMDGPUAS::CONSTANT_BUFFER_5:
   1442     return 512 + 4096 * 5;
   1443   case AMDGPUAS::CONSTANT_BUFFER_6:
   1444     return 512 + 4096 * 6;
   1445   case AMDGPUAS::CONSTANT_BUFFER_7:
   1446     return 512 + 4096 * 7;
   1447   case AMDGPUAS::CONSTANT_BUFFER_8:
   1448     return 512 + 4096 * 8;
   1449   case AMDGPUAS::CONSTANT_BUFFER_9:
   1450     return 512 + 4096 * 9;
   1451   case AMDGPUAS::CONSTANT_BUFFER_10:
   1452     return 512 + 4096 * 10;
   1453   case AMDGPUAS::CONSTANT_BUFFER_11:
   1454     return 512 + 4096 * 11;
   1455   case AMDGPUAS::CONSTANT_BUFFER_12:
   1456     return 512 + 4096 * 12;
   1457   case AMDGPUAS::CONSTANT_BUFFER_13:
   1458     return 512 + 4096 * 13;
   1459   case AMDGPUAS::CONSTANT_BUFFER_14:
   1460     return 512 + 4096 * 14;
   1461   case AMDGPUAS::CONSTANT_BUFFER_15:
   1462     return 512 + 4096 * 15;
   1463   default:
   1464     return -1;
   1465   }
   1466 }
   1467 
    1468 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   1470   EVT VT = Op.getValueType();
   1471   SDLoc DL(Op);
   1472   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   1473   SDValue Chain = Op.getOperand(0);
   1474   SDValue Ptr = Op.getOperand(1);
   1475   SDValue LoweredLoad;
   1476 
   1477   if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
   1478     return Ret;
   1479 
    1480   // Lower constant address space loads of global variables.
   1481   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
   1482       isa<GlobalVariable>(GetUnderlyingObject(
   1483           LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
   1484 
   1485     SDValue Ptr = DAG.getZExtOrTrunc(
   1486         LoadNode->getBasePtr(), DL,
   1487         getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
   1488     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
   1489         DAG.getConstant(2, DL, MVT::i32));
   1490     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
   1491                        LoadNode->getChain(), Ptr,
   1492                        DAG.getTargetConstant(0, DL, MVT::i32),
   1493                        Op.getOperand(2));
   1494   }
   1495 
   1496   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
   1497     SDValue MergedValues[2] = {
   1498       ScalarizeVectorLoad(Op, DAG),
   1499       Chain
   1500     };
   1501     return DAG.getMergeValues(MergedValues, DL);
   1502   }
   1503 
   1504   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
   1505   if (ConstantBlock > -1 &&
   1506       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
   1507        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
   1508     SDValue Result;
   1509     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
   1510         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
   1511         isa<ConstantSDNode>(Ptr)) {
   1512       SDValue Slots[4];
   1513       for (unsigned i = 0; i < 4; i++) {
    1514         // We want the Const position encoded with the following formula :
    1515         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
    1516         // const_index corresponds to Ptr, which llvm computes using an
    1517         // alignment of 16. Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4
    1518         // here and then divide by 4 at the ISel step.
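                 // For example (illustrative values): kc_bank = 1, const_index = 2
                 // (Ptr = 32), chan = 3 gives 32 + (512 + 4096) * 16 + 3 * 4 = 73772,
                 // and 73772 / 4 = 18443 = ((512 + 4096 + 2) << 2) + 3.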
   1519         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
   1520             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
   1521         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
   1522       }
   1523       EVT NewVT = MVT::v4i32;
   1524       unsigned NumElements = 4;
   1525       if (VT.isVector()) {
   1526         NewVT = VT;
   1527         NumElements = VT.getVectorNumElements();
   1528       }
   1529       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
   1530                            makeArrayRef(Slots, NumElements));
   1531     } else {
    1532       // A non-constant Ptr can't be folded; keep it as a v4i32 CONST_ADDRESS load.
   1533       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
   1534           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
   1535                       DAG.getConstant(4, DL, MVT::i32)),
   1536                       DAG.getConstant(LoadNode->getAddressSpace() -
   1537                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
   1538           );
   1539     }
   1540 
   1541     if (!VT.isVector()) {
   1542       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
   1543                            DAG.getConstant(0, DL, MVT::i32));
   1544     }
   1545 
   1546     SDValue MergedValues[2] = {
   1547       Result,
   1548       Chain
   1549     };
   1550     return DAG.getMergeValues(MergedValues, DL);
   1551   }
   1552 
   1553   // For most operations returning SDValue() will result in the node being
   1554   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
   1555   // need to manually expand loads that may be legal in some address spaces and
   1556   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
   1557   // compute shaders, since the data is sign extended when it is uploaded to the
   1558   // buffer. However SEXT loads from other address spaces are not supported, so
   1559   // we need to expand them here.
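           // i.e. (sextload i8) is expanded to
           //      (sign_extend_inreg (extload i8), i8).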
   1560   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
   1561     EVT MemVT = LoadNode->getMemoryVT();
   1562     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
   1563     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
   1564                                   LoadNode->getPointerInfo(), MemVT,
   1565                                   LoadNode->isVolatile(),
   1566                                   LoadNode->isNonTemporal(),
   1567                                   LoadNode->isInvariant(),
   1568                                   LoadNode->getAlignment());
   1569     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
   1570                               DAG.getValueType(MemVT));
   1571 
   1572     SDValue MergedValues[2] = { Res, Chain };
   1573     return DAG.getMergeValues(MergedValues, DL);
   1574   }
   1575 
   1576   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1577     return SDValue();
   1578   }
   1579 
   1580   // Lowering for indirect addressing
   1581   const MachineFunction &MF = DAG.getMachineFunction();
   1582   const AMDGPUFrameLowering *TFL =
   1583       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
   1584   unsigned StackWidth = TFL->getStackWidth(MF);
   1585 
   1586   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1587 
   1588   if (VT.isVector()) {
   1589     unsigned NumElemVT = VT.getVectorNumElements();
   1590     EVT ElemVT = VT.getVectorElementType();
   1591     SDValue Loads[4];
   1592 
   1593     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
   1594                                       "vector width in load");
   1595 
   1596     for (unsigned i = 0; i < NumElemVT; ++i) {
   1597       unsigned Channel, PtrIncr;
   1598       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1599       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1600                         DAG.getConstant(PtrIncr, DL, MVT::i32));
   1601       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
   1602                              Chain, Ptr,
   1603                              DAG.getTargetConstant(Channel, DL, MVT::i32),
   1604                              Op.getOperand(2));
   1605     }
   1606     for (unsigned i = NumElemVT; i < 4; ++i) {
   1607       Loads[i] = DAG.getUNDEF(ElemVT);
   1608     }
   1609     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
   1610     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
   1611   } else {
   1612     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
   1613                               Chain, Ptr,
   1614                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
   1615                               Op.getOperand(2));
   1616   }
   1617 
   1618   SDValue Ops[2] = {
   1619     LoweredLoad,
   1620     Chain
   1621   };
   1622 
   1623   return DAG.getMergeValues(Ops, DL);
   1624 }
   1625 
   1626 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   1627   SDValue Chain = Op.getOperand(0);
   1628   SDValue Cond  = Op.getOperand(1);
   1629   SDValue Jump  = Op.getOperand(2);
   1630 
   1631   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
   1632                      Chain, Jump, Cond);
   1633 }
   1634 
   1635 /// XXX Only kernel functions are supported, so we can assume for now that
   1636 /// every function is a kernel function, but in the future we should use
   1637 /// separate calling conventions for kernel and non-kernel functions.
   1638 SDValue R600TargetLowering::LowerFormalArguments(
   1639                                       SDValue Chain,
   1640                                       CallingConv::ID CallConv,
   1641                                       bool isVarArg,
   1642                                       const SmallVectorImpl<ISD::InputArg> &Ins,
   1643                                       SDLoc DL, SelectionDAG &DAG,
   1644                                       SmallVectorImpl<SDValue> &InVals) const {
   1645   SmallVector<CCValAssign, 16> ArgLocs;
   1646   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
   1647                  *DAG.getContext());
   1648   MachineFunction &MF = DAG.getMachineFunction();
   1649   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
   1650 
   1651   SmallVector<ISD::InputArg, 8> LocalIns;
   1652 
   1653   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
   1654 
   1655   AnalyzeFormalArguments(CCInfo, LocalIns);
   1656 
   1657   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
   1658     CCValAssign &VA = ArgLocs[i];
   1659     const ISD::InputArg &In = Ins[i];
   1660     EVT VT = In.VT;
   1661     EVT MemVT = VA.getLocVT();
   1662     if (!VT.isVector() && MemVT.isVector()) {
   1663       // Get load source type if scalarized.
   1664       MemVT = MemVT.getVectorElementType();
   1665     }
   1666 
   1667     if (MFI->getShaderType() != ShaderType::COMPUTE) {
   1668       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
   1669       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
   1670       InVals.push_back(Register);
   1671       continue;
   1672     }
   1673 
   1674     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
   1675                                           AMDGPUAS::CONSTANT_BUFFER_0);
   1676 
   1677     // i64 isn't a legal type, so the register type used ends up as i32, which
   1678     // isn't expected here. It attempts to create this sextload, but it ends up
   1679     // being invalid. Somehow this seems to work with i64 arguments, but breaks
   1680     // for <1 x i64>.
   1681 
    1682     // The first 36 bytes of the input buffer contain information about
    1683     // thread group and global sizes.
   1684     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
   1685     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
   1686       // FIXME: This should really check the extload type, but the handling of
   1687       // extload vector parameters seems to be broken.
   1688 
   1689       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
   1690       Ext = ISD::SEXTLOAD;
   1691     }
   1692 
   1693     // Compute the offset from the value.
   1694     // XXX - I think PartOffset should give you this, but it seems to give the
   1695     // size of the register which isn't useful.
   1696 
   1697     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
   1698     unsigned PartOffset = VA.getLocMemOffset();
   1699     unsigned Offset = 36 + VA.getLocMemOffset();
   1700 
   1701     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
   1702     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
   1703                               DAG.getConstant(Offset, DL, MVT::i32),
   1704                               DAG.getUNDEF(MVT::i32),
   1705                               PtrInfo,
   1706                               MemVT, false, true, true, 4);
   1707 
   1708     // 4 is the preferred alignment for the CONSTANT memory space.
   1709     InVals.push_back(Arg);
   1710     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
   1711   }
   1712   return Chain;
   1713 }
   1714 
   1715 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
   1716                                            EVT VT) const {
    1717   if (!VT.isVector())
    1718     return MVT::i32;
    1719   return VT.changeVectorElementTypeToInteger();
   1720 }
   1721 
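         // Fold BUILD_VECTOR operands that can be expressed through a swizzle
         // instead: undef elements become a write mask, constant 0.0 / 1.0 become
         // the SEL_0 / SEL_1 sources, and duplicated elements are replaced by a
         // reference to their first occurrence. RemapSwizzle records the old -> new
         // swizzle selection for each channel.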
   1722 static SDValue CompactSwizzlableVector(
   1723   SelectionDAG &DAG, SDValue VectorEntry,
   1724   DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1725   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1726   assert(RemapSwizzle.empty());
   1727   SDValue NewBldVec[4] = {
   1728     VectorEntry.getOperand(0),
   1729     VectorEntry.getOperand(1),
   1730     VectorEntry.getOperand(2),
   1731     VectorEntry.getOperand(3)
   1732   };
   1733 
   1734   for (unsigned i = 0; i < 4; i++) {
   1735     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
    1736       // We mask the write here to teach later passes that the ith element of
    1737       // this vector is undef. Thus we can use it to reduce 128-bit register
    1738       // usage, break false dependencies and additionally make assembly easier to read.
   1739       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
   1740     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
   1741       if (C->isZero()) {
   1742         RemapSwizzle[i] = 4; // SEL_0
   1743         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1744       } else if (C->isExactlyValue(1.0)) {
   1745         RemapSwizzle[i] = 5; // SEL_1
   1746         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1747       }
   1748     }
   1749 
   1750     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
   1751       continue;
   1752     for (unsigned j = 0; j < i; j++) {
   1753       if (NewBldVec[i] == NewBldVec[j]) {
   1754         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
   1755         RemapSwizzle[i] = j;
   1756         break;
   1757       }
   1758     }
   1759   }
   1760 
   1761   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1762                      VectorEntry.getValueType(), NewBldVec);
   1763 }
   1764 
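         // Where possible, swap an EXTRACT_VECTOR_ELT operand into the output
         // channel matching the element it extracts (elements that already match
         // their channel are left in place), so the final swizzle is closer to the
         // identity. RemapSwizzle records the reordering.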
   1765 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
   1766                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1767   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1768   assert(RemapSwizzle.empty());
   1769   SDValue NewBldVec[4] = {
   1770       VectorEntry.getOperand(0),
   1771       VectorEntry.getOperand(1),
   1772       VectorEntry.getOperand(2),
   1773       VectorEntry.getOperand(3)
   1774   };
   1775   bool isUnmovable[4] = { false, false, false, false };
   1776   for (unsigned i = 0; i < 4; i++) {
   1777     RemapSwizzle[i] = i;
   1778     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1779       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1780           ->getZExtValue();
   1781       if (i == Idx)
   1782         isUnmovable[Idx] = true;
   1783     }
   1784   }
   1785 
   1786   for (unsigned i = 0; i < 4; i++) {
   1787     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    1788       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
    1789           ->getZExtValue();
   1790       if (isUnmovable[Idx])
   1791         continue;
   1792       // Swap i and Idx
   1793       std::swap(NewBldVec[Idx], NewBldVec[i]);
   1794       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
   1795       break;
   1796     }
   1797   }
   1798 
   1799   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1800                      VectorEntry.getValueType(), NewBldVec);
   1801 }
   1802 
   1803 
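         // Compact and reorder the BUILD_VECTOR feeding an EXPORT or TEXTURE_FETCH
         // node, updating the four swizzle operands in Swz so they keep selecting
         // the same values after the remapping.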
   1804 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
   1805                                             SDValue Swz[4], SelectionDAG &DAG,
   1806                                             SDLoc DL) const {
   1807   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   1808   // Old -> New swizzle values
   1809   DenseMap<unsigned, unsigned> SwizzleRemap;
   1810 
   1811   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
   1812   for (unsigned i = 0; i < 4; i++) {
   1813     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1814     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1815       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
   1816   }
   1817 
   1818   SwizzleRemap.clear();
   1819   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
   1820   for (unsigned i = 0; i < 4; i++) {
   1821     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1822     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1823       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
   1824   }
   1825 
   1826   return BuildVector;
   1827 }
   1828 
   1829 
   1830 //===----------------------------------------------------------------------===//
   1831 // Custom DAG Optimizations
   1832 //===----------------------------------------------------------------------===//
   1833 
   1834 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   1835                                               DAGCombinerInfo &DCI) const {
   1836   SelectionDAG &DAG = DCI.DAG;
   1837 
   1838   switch (N->getOpcode()) {
   1839   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   1840   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
   1841   case ISD::FP_ROUND: {
   1842       SDValue Arg = N->getOperand(0);
   1843       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
   1844         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
   1845                            Arg.getOperand(0));
   1846       }
   1847       break;
   1848     }
   1849 
   1850   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
   1851   // (i32 select_cc f32, f32, -1, 0 cc)
   1852   //
   1853   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
   1854   // this to one of the SET*_DX10 instructions.
   1855   case ISD::FP_TO_SINT: {
   1856     SDValue FNeg = N->getOperand(0);
   1857     if (FNeg.getOpcode() != ISD::FNEG) {
   1858       return SDValue();
   1859     }
   1860     SDValue SelectCC = FNeg.getOperand(0);
   1861     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
   1862         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
   1863         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
   1864         !isHWTrueValue(SelectCC.getOperand(2)) ||
   1865         !isHWFalseValue(SelectCC.getOperand(3))) {
   1866       return SDValue();
   1867     }
   1868 
   1869     SDLoc dl(N);
   1870     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
   1871                            SelectCC.getOperand(0), // LHS
   1872                            SelectCC.getOperand(1), // RHS
   1873                            DAG.getConstant(-1, dl, MVT::i32), // True
   1874                            DAG.getConstant(0, dl, MVT::i32),  // False
   1875                            SelectCC.getOperand(4)); // CC
   1878   }
   1879 
   1880   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
   1881   // => build_vector elt0, ... , NewEltIdx, ... , eltN
   1882   case ISD::INSERT_VECTOR_ELT: {
   1883     SDValue InVec = N->getOperand(0);
   1884     SDValue InVal = N->getOperand(1);
   1885     SDValue EltNo = N->getOperand(2);
   1886     SDLoc dl(N);
   1887 
   1888     // If the inserted element is an UNDEF, just use the input vector.
   1889     if (InVal.getOpcode() == ISD::UNDEF)
   1890       return InVec;
   1891 
   1892     EVT VT = InVec.getValueType();
   1893 
   1894     // If we can't generate a legal BUILD_VECTOR, exit
   1895     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
   1896       return SDValue();
   1897 
   1898     // Check that we know which element is being inserted
   1899     if (!isa<ConstantSDNode>(EltNo))
   1900       return SDValue();
   1901     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   1902 
   1903     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   1904     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   1905     // vector elements.
   1906     SmallVector<SDValue, 8> Ops;
   1907     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   1908       Ops.append(InVec.getNode()->op_begin(),
   1909                  InVec.getNode()->op_end());
   1910     } else if (InVec.getOpcode() == ISD::UNDEF) {
   1911       unsigned NElts = VT.getVectorNumElements();
   1912       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
   1913     } else {
   1914       return SDValue();
   1915     }
   1916 
   1917     // Insert the element
   1918     if (Elt < Ops.size()) {
   1919       // All the operands of BUILD_VECTOR must have the same type;
   1920       // we enforce that here.
   1921       EVT OpVT = Ops[0].getValueType();
   1922       if (InVal.getValueType() != OpVT)
   1923         InVal = OpVT.bitsGT(InVal.getValueType()) ?
   1924           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
   1925           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
   1926       Ops[Elt] = InVal;
   1927     }
   1928 
   1929     // Return the new vector
   1930     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   1931   }
   1932 
    1933   // An extract_vector_elt of a build_vector generated by custom lowering
    1934   // also needs a custom combine here.
   1935   case ISD::EXTRACT_VECTOR_ELT: {
   1936     SDValue Arg = N->getOperand(0);
   1937     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
   1938       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1939         unsigned Element = Const->getZExtValue();
   1940         return Arg->getOperand(Element);
   1941       }
   1942     }
   1943     if (Arg.getOpcode() == ISD::BITCAST &&
   1944         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   1945       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1946         unsigned Element = Const->getZExtValue();
   1947         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
   1948             Arg->getOperand(0).getOperand(Element));
   1949       }
   1950     }
   1951     break;
   1952   }
   1953 
   1954   case ISD::SELECT_CC: {
   1955     // Try common optimizations
   1956     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   1957     if (Ret.getNode())
   1958       return Ret;
   1959 
   1960     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
   1961     //      selectcc x, y, a, b, inv(cc)
   1962     //
   1963     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
   1964     //      selectcc x, y, a, b, cc
   1965     SDValue LHS = N->getOperand(0);
   1966     if (LHS.getOpcode() != ISD::SELECT_CC) {
   1967       return SDValue();
   1968     }
   1969 
   1970     SDValue RHS = N->getOperand(1);
   1971     SDValue True = N->getOperand(2);
   1972     SDValue False = N->getOperand(3);
   1973     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
   1974 
   1975     if (LHS.getOperand(2).getNode() != True.getNode() ||
   1976         LHS.getOperand(3).getNode() != False.getNode() ||
   1977         RHS.getNode() != False.getNode()) {
   1978       return SDValue();
   1979     }
   1980 
   1981     switch (NCC) {
   1982     default: return SDValue();
   1983     case ISD::SETNE: return LHS;
   1984     case ISD::SETEQ: {
   1985       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
   1986       LHSCC = ISD::getSetCCInverse(LHSCC,
   1987                                   LHS.getOperand(0).getValueType().isInteger());
   1988       if (DCI.isBeforeLegalizeOps() ||
   1989           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
   1990         return DAG.getSelectCC(SDLoc(N),
   1991                                LHS.getOperand(0),
   1992                                LHS.getOperand(1),
   1993                                LHS.getOperand(2),
   1994                                LHS.getOperand(3),
   1995                                LHSCC);
   1996       break;
   1997     }
   1998     }
   1999     return SDValue();
   2000   }
   2001 
   2002   case AMDGPUISD::EXPORT: {
   2003     SDValue Arg = N->getOperand(1);
   2004     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2005       break;
   2006 
   2007     SDValue NewArgs[8] = {
   2008       N->getOperand(0), // Chain
   2009       SDValue(),
   2010       N->getOperand(2), // ArrayBase
   2011       N->getOperand(3), // Type
   2012       N->getOperand(4), // SWZ_X
   2013       N->getOperand(5), // SWZ_Y
   2014       N->getOperand(6), // SWZ_Z
   2015       N->getOperand(7) // SWZ_W
   2016     };
   2017     SDLoc DL(N);
   2018     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
   2019     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
   2020   }
   2021   case AMDGPUISD::TEXTURE_FETCH: {
   2022     SDValue Arg = N->getOperand(1);
   2023     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2024       break;
   2025 
   2026     SDValue NewArgs[19] = {
   2027       N->getOperand(0),
   2028       N->getOperand(1),
   2029       N->getOperand(2),
   2030       N->getOperand(3),
   2031       N->getOperand(4),
   2032       N->getOperand(5),
   2033       N->getOperand(6),
   2034       N->getOperand(7),
   2035       N->getOperand(8),
   2036       N->getOperand(9),
   2037       N->getOperand(10),
   2038       N->getOperand(11),
   2039       N->getOperand(12),
   2040       N->getOperand(13),
   2041       N->getOperand(14),
   2042       N->getOperand(15),
   2043       N->getOperand(16),
   2044       N->getOperand(17),
   2045       N->getOperand(18),
   2046     };
   2047     SDLoc DL(N);
   2048     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
   2049     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
   2050   }
   2051   }
   2052 
   2053   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   2054 }
   2055 
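         // Try to fold the operand Src of ParentNode directly into the instruction:
         // FNEG_R600 / FABS_R600 become the neg / abs modifiers, CONST_COPY becomes
         // an ALU_CONST read (subject to the constant read limitations), and
         // MOV_IMM_* becomes an inline constant register or the ALU_LITERAL_X slot.
         // Returns true and updates the operand references on success.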
   2056 static bool
   2057 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
   2058             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
   2059   const R600InstrInfo *TII =
   2060       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
   2061   if (!Src.isMachineOpcode())
   2062     return false;
   2063   switch (Src.getMachineOpcode()) {
   2064   case AMDGPU::FNEG_R600:
   2065     if (!Neg.getNode())
   2066       return false;
   2067     Src = Src.getOperand(0);
   2068     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
   2069     return true;
   2070   case AMDGPU::FABS_R600:
   2071     if (!Abs.getNode())
   2072       return false;
   2073     Src = Src.getOperand(0);
   2074     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
   2075     return true;
   2076   case AMDGPU::CONST_COPY: {
   2077     unsigned Opcode = ParentNode->getMachineOpcode();
   2078     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2079 
   2080     if (!Sel.getNode())
   2081       return false;
   2082 
   2083     SDValue CstOffset = Src.getOperand(0);
   2084     if (ParentNode->getValueType(0).isVector())
   2085       return false;
   2086 
    2087     // Gather the constant values already used by this instruction.
   2088     int SrcIndices[] = {
   2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
   2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2096       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2097       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2098       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2099       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2100     };
   2101     std::vector<unsigned> Consts;
   2102     for (int OtherSrcIdx : SrcIndices) {
   2103       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
   2104       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
   2105         continue;
   2106       if (HasDst) {
   2107         OtherSrcIdx--;
   2108         OtherSelIdx--;
   2109       }
   2110       if (RegisterSDNode *Reg =
   2111           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
   2112         if (Reg->getReg() == AMDGPU::ALU_CONST) {
   2113           ConstantSDNode *Cst
   2114             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
   2115           Consts.push_back(Cst->getZExtValue());
   2116         }
   2117       }
   2118     }
   2119 
   2120     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
   2121     Consts.push_back(Cst->getZExtValue());
   2122     if (!TII->fitsConstReadLimitations(Consts)) {
   2123       return false;
   2124     }
   2125 
   2126     Sel = CstOffset;
   2127     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
   2128     return true;
   2129   }
   2130   case AMDGPU::MOV_IMM_I32:
   2131   case AMDGPU::MOV_IMM_F32: {
   2132     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
   2133     uint64_t ImmValue = 0;
    2134 
   2136     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
    2137       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
   2138       float FloatValue = FPC->getValueAPF().convertToFloat();
   2139       if (FloatValue == 0.0) {
   2140         ImmReg = AMDGPU::ZERO;
   2141       } else if (FloatValue == 0.5) {
   2142         ImmReg = AMDGPU::HALF;
   2143       } else if (FloatValue == 1.0) {
   2144         ImmReg = AMDGPU::ONE;
   2145       } else {
   2146         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
   2147       }
   2148     } else {
    2149       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
   2150       uint64_t Value = C->getZExtValue();
   2151       if (Value == 0) {
   2152         ImmReg = AMDGPU::ZERO;
   2153       } else if (Value == 1) {
   2154         ImmReg = AMDGPU::ONE_INT;
   2155       } else {
   2156         ImmValue = Value;
   2157       }
   2158     }
   2159 
   2160     // Check that we aren't already using an immediate.
   2161     // XXX: It's possible for an instruction to have more than one
   2162     // immediate operand, but this is not supported yet.
   2163     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
   2164       if (!Imm.getNode())
   2165         return false;
   2166       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
   2167       assert(C);
   2168       if (C->getZExtValue())
   2169         return false;
   2170       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
   2171     }
   2172     Src = DAG.getRegister(ImmReg, MVT::i32);
   2173     return true;
   2174   }
   2175   default:
   2176     return false;
   2177   }
   2178 }
   2179 
   2180 
   2181 /// \brief Fold the instructions after selecting them
   2182 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
   2183                                             SelectionDAG &DAG) const {
   2184   const R600InstrInfo *TII =
   2185       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
   2186   if (!Node->isMachineOpcode())
   2187     return Node;
   2188   unsigned Opcode = Node->getMachineOpcode();
   2189   SDValue FakeOp;
   2190 
   2191   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
   2192 
   2193   if (Opcode == AMDGPU::DOT_4) {
   2194     int OperandIdx[] = {
   2195       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2196       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2197       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2198       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2199       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2200       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2201       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2202       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2203         };
   2204     int NegIdx[] = {
   2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
   2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
   2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
   2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
   2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
   2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
   2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
   2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
   2213     };
   2214     int AbsIdx[] = {
   2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
   2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
   2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
   2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
   2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
   2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
   2221       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
   2222       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
   2223     };
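             // Operand indices from getOperandIdx count the MachineInstr's dst
             // operand, which is not part of the SDNode operand list, so indices
             // into Ops are adjusted by one below.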
   2224     for (unsigned i = 0; i < 8; i++) {
   2225       if (OperandIdx[i] < 0)
   2226         return Node;
   2227       SDValue &Src = Ops[OperandIdx[i] - 1];
   2228       SDValue &Neg = Ops[NegIdx[i] - 1];
   2229       SDValue &Abs = Ops[AbsIdx[i] - 1];
   2230       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2231       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2232       if (HasDst)
   2233         SelIdx--;
   2234       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2235       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
   2236         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2237     }
   2238   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
   2239     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
   2240       SDValue &Src = Ops[i];
   2241       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
   2242         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2243     }
   2244   } else if (Opcode == AMDGPU::CLAMP_R600) {
   2245     SDValue Src = Node->getOperand(0);
   2246     if (!Src.isMachineOpcode() ||
   2247         !TII->hasInstrModifiers(Src.getMachineOpcode()))
   2248       return Node;
   2249     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
   2250         AMDGPU::OpName::clamp);
   2251     if (ClampIdx < 0)
   2252       return Node;
   2253     SDLoc DL(Node);
   2254     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
   2255     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
   2256     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
   2257                               Node->getVTList(), Ops);
   2258   } else {
   2259     if (!TII->hasInstrModifiers(Opcode))
   2260       return Node;
   2261     int OperandIdx[] = {
   2262       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2264       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
   2265     };
   2266     int NegIdx[] = {
   2267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
   2268       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
   2269       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
   2270     };
   2271     int AbsIdx[] = {
   2272       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
   2273       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
   2274       -1
   2275     };
   2276     for (unsigned i = 0; i < 3; i++) {
   2277       if (OperandIdx[i] < 0)
   2278         return Node;
   2279       SDValue &Src = Ops[OperandIdx[i] - 1];
   2280       SDValue &Neg = Ops[NegIdx[i] - 1];
   2281       SDValue FakeAbs;
   2282       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
   2283       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2284       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2285       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
   2286       if (HasDst) {
   2287         SelIdx--;
   2288         ImmIdx--;
   2289       }
   2290       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2291       SDValue &Imm = Ops[ImmIdx];
   2292       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
   2293         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2294     }
   2295   }
   2296 
   2297   return Node;
   2298 }
   2299