      1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief Custom DAG lowering for R600
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "R600ISelLowering.h"
     16 #include "AMDGPUFrameLowering.h"
     17 #include "AMDGPUIntrinsicInfo.h"
     18 #include "AMDGPUSubtarget.h"
     19 #include "R600Defines.h"
     20 #include "R600InstrInfo.h"
     21 #include "R600MachineFunctionInfo.h"
     22 #include "llvm/Analysis/ValueTracking.h"
     23 #include "llvm/CodeGen/CallingConvLower.h"
     24 #include "llvm/CodeGen/MachineFrameInfo.h"
     25 #include "llvm/CodeGen/MachineInstrBuilder.h"
     26 #include "llvm/CodeGen/MachineRegisterInfo.h"
     27 #include "llvm/CodeGen/SelectionDAG.h"
     28 #include "llvm/IR/Argument.h"
     29 #include "llvm/IR/Function.h"
     30 
     31 using namespace llvm;
     32 
     33 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
     34                                        const R600Subtarget &STI)
     35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
     36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
     37   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
     38   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
     39   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
     40   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
     41   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
     42 
     43   computeRegisterProperties(STI.getRegisterInfo());
     44 
     45   // Legalize loads and stores to the private address space.
     46   setOperationAction(ISD::LOAD, MVT::i32, Custom);
     47   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
     48   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
     49 
     50   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
     51   // spaces, so it is custom lowered to handle those where it isn't.
     52   for (MVT VT : MVT::integer_valuetypes()) {
     53     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
     54     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
     55     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
     56 
     57     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
     58     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
     59     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
     60 
     61     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
     62     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
     63     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
     64   }
     65 
     66   // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
     67   setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
     68   setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
     69   setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
     70 
     71   setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
     72   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
     73   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
     74 
     75 
     76   setOperationAction(ISD::STORE, MVT::i8, Custom);
     77   setOperationAction(ISD::STORE, MVT::i32, Custom);
     78   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
     79   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
     80 
     81   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
     82   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
     83 
     84   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
     85   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
     86   setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
     87 
     88   // Set condition code actions
     89   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
     90   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
     91   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
     92   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
     93   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
     94   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
     95   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
     96   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
     97   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
     98   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
     99   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
    100   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
    101 
    102   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
    103   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
    104   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
    105   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
    106 
    107   setOperationAction(ISD::FCOS, MVT::f32, Custom);
    108   setOperationAction(ISD::FSIN, MVT::f32, Custom);
    109 
    110   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
    111   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
    112 
    113   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
    114   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
    115   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
    116 
    117   setOperationAction(ISD::FSUB, MVT::f32, Expand);
    118 
    119   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
    120   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
    121 
    122   setOperationAction(ISD::SETCC, MVT::i32, Expand);
    123   setOperationAction(ISD::SETCC, MVT::f32, Expand);
    124   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
    125   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    126   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    127 
    128   setOperationAction(ISD::SELECT, MVT::i32, Expand);
    129   setOperationAction(ISD::SELECT, MVT::f32, Expand);
    130   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
    131   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
    132 
    133   // ADD, SUB overflow.
    134   // TODO: turn these into Legal?
    135   if (Subtarget->hasCARRY())
    136     setOperationAction(ISD::UADDO, MVT::i32, Custom);
    137 
    138   if (Subtarget->hasBORROW())
    139     setOperationAction(ISD::USUBO, MVT::i32, Custom);
    140 
    141   // Expand sign extension of vectors
    142   if (!Subtarget->hasBFE())
    143     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
    144 
    145   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
    146   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
    147 
    148   if (!Subtarget->hasBFE())
    149     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
    150   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
    151   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
    152 
    153   if (!Subtarget->hasBFE())
    154     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    155   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
    156   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
    157 
    158   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
    159   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
    160   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
    161 
    162   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
    163 
    164   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
    165 
    166   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
    167   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
    168   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    169   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    170 
    171   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
    172   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
    173   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    174   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    175 
    176   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
    177   //  to be Legal/Custom in order to avoid library calls.
    178   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    179   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
    180   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    181 
    182   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
    183 
    184   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
    185   for (MVT VT : ScalarIntVTs) {
    186     setOperationAction(ISD::ADDC, VT, Expand);
    187     setOperationAction(ISD::SUBC, VT, Expand);
    188     setOperationAction(ISD::ADDE, VT, Expand);
    189     setOperationAction(ISD::SUBE, VT, Expand);
    190   }
    191 
    192   setSchedulingPreference(Sched::Source);
    193 
    194 
    195   setTargetDAGCombine(ISD::FP_ROUND);
    196   setTargetDAGCombine(ISD::FP_TO_SINT);
    197   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    198   setTargetDAGCombine(ISD::SELECT_CC);
    199   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
    200 }
    201 
    202 const R600Subtarget *R600TargetLowering::getSubtarget() const {
    203   return static_cast<const R600Subtarget *>(Subtarget);
    204 }
    205 
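         // Returns true when the instruction following \p I is the block's RETURN,
         // i.e. \p I is the last real instruction and the End Of Program (EOP) bit
         // should be set on the exports / RAT writes emitted for it.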
    206 static inline bool isEOP(MachineBasicBlock::iterator I) {
    207   return std::next(I)->getOpcode() == AMDGPU::RETURN;
    208 }
    209 
    210 MachineBasicBlock *
    211 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
    212                                                 MachineBasicBlock *BB) const {
    213   MachineFunction * MF = BB->getParent();
    214   MachineRegisterInfo &MRI = MF->getRegInfo();
    215   MachineBasicBlock::iterator I = MI;
    216   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
    217 
    218   switch (MI.getOpcode()) {
    219   default:
     220     // Replace LDS_*_RET instructions that don't have any uses with the
     221     // equivalent LDS_*_NORET instructions.
    222     if (TII->isLDSRetInstr(MI.getOpcode())) {
    223       int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
    224       assert(DstIdx != -1);
    225       MachineInstrBuilder NewMI;
    226       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
    227       //        LDS_1A2D support and remove this special case.
    228       if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
    229           MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
    230         return BB;
    231 
    232       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
    233                       TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
    234       for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
    235         NewMI.addOperand(MI.getOperand(i));
    236       }
    237     } else {
    238       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    239     }
    240     break;
    241   case AMDGPU::CLAMP_R600: {
    242     MachineInstr *NewMI = TII->buildDefaultInstruction(
    243         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
    244         MI.getOperand(1).getReg());
    245     TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
    246     break;
    247   }
    248 
    249   case AMDGPU::FABS_R600: {
    250     MachineInstr *NewMI = TII->buildDefaultInstruction(
    251         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
    252         MI.getOperand(1).getReg());
    253     TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
    254     break;
    255   }
    256 
    257   case AMDGPU::FNEG_R600: {
    258     MachineInstr *NewMI = TII->buildDefaultInstruction(
    259         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
    260         MI.getOperand(1).getReg());
    261     TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
    262     break;
    263   }
    264 
    265   case AMDGPU::MASK_WRITE: {
    266     unsigned maskedRegister = MI.getOperand(0).getReg();
    267     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    268     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    269     TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
    270     break;
    271   }
    272 
    273   case AMDGPU::MOV_IMM_F32:
    274     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
    275                                                             .getFPImm()
    276                                                             ->getValueAPF()
    277                                                             .bitcastToAPInt()
    278                                                             .getZExtValue());
    279     break;
    280   case AMDGPU::MOV_IMM_I32:
    281     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
    282                      MI.getOperand(1).getImm());
    283     break;
    284   case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
     285     // TODO: Perhaps combine this instruction with the next if possible.
    286     auto MIB = TII->buildDefaultInstruction(
    287         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
    288     int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
     289     // TODO: Ugh, this is rather ugly.
    290     MIB->getOperand(Idx) = MI.getOperand(1);
    291     break;
    292   }
    293   case AMDGPU::CONST_COPY: {
    294     MachineInstr *NewMI = TII->buildDefaultInstruction(
    295         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
    296     TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
    297                        MI.getOperand(1).getImm());
    298     break;
    299   }
    300 
    301   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
    302   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
    303   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    304     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
    305         .addOperand(MI.getOperand(0))
    306         .addOperand(MI.getOperand(1))
    307         .addImm(isEOP(I)); // Set End of program bit
    308     break;
    309   }
    310   case AMDGPU::RAT_STORE_TYPED_eg: {
    311     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
    312         .addOperand(MI.getOperand(0))
    313         .addOperand(MI.getOperand(1))
    314         .addOperand(MI.getOperand(2))
    315         .addImm(isEOP(I)); // Set End of program bit
    316     break;
    317   }
    318 
    319   case AMDGPU::TXD: {
    320     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    321     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    322     MachineOperand &RID = MI.getOperand(4);
    323     MachineOperand &SID = MI.getOperand(5);
    324     unsigned TextureId = MI.getOperand(6).getImm();
    325     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    326     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
    327 
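             // Src{X,Y,Z,W} select which input component feeds each texture coordinate;
             // CT{X,Y,Z,W} appear to be per-coordinate type flags (1 = normalized,
             // 0 = unnormalized). Both are adjusted per texture target below, e.g. Rect
             // targets use unnormalized X/Y and array targets an unnormalized layer index.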
    328     switch (TextureId) {
    329     case 5: // Rect
    330       CTX = CTY = 0;
    331       break;
    332     case 6: // Shadow1D
    333       SrcW = SrcZ;
    334       break;
    335     case 7: // Shadow2D
    336       SrcW = SrcZ;
    337       break;
    338     case 8: // ShadowRect
    339       CTX = CTY = 0;
    340       SrcW = SrcZ;
    341       break;
    342     case 9: // 1DArray
    343       SrcZ = SrcY;
    344       CTZ = 0;
    345       break;
    346     case 10: // 2DArray
    347       CTZ = 0;
    348       break;
    349     case 11: // Shadow1DArray
    350       SrcZ = SrcY;
    351       CTZ = 0;
    352       break;
    353     case 12: // Shadow2DArray
    354       CTZ = 0;
    355       break;
    356     }
    357     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
    358             T0)
    359         .addOperand(MI.getOperand(3))
    360         .addImm(SrcX)
    361         .addImm(SrcY)
    362         .addImm(SrcZ)
    363         .addImm(SrcW)
    364         .addImm(0)
    365         .addImm(0)
    366         .addImm(0)
    367         .addImm(0)
    368         .addImm(1)
    369         .addImm(2)
    370         .addImm(3)
    371         .addOperand(RID)
    372         .addOperand(SID)
    373         .addImm(CTX)
    374         .addImm(CTY)
    375         .addImm(CTZ)
    376         .addImm(CTW);
    377     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
    378             T1)
    379         .addOperand(MI.getOperand(2))
    380         .addImm(SrcX)
    381         .addImm(SrcY)
    382         .addImm(SrcZ)
    383         .addImm(SrcW)
    384         .addImm(0)
    385         .addImm(0)
    386         .addImm(0)
    387         .addImm(0)
    388         .addImm(1)
    389         .addImm(2)
    390         .addImm(3)
    391         .addOperand(RID)
    392         .addOperand(SID)
    393         .addImm(CTX)
    394         .addImm(CTY)
    395         .addImm(CTZ)
    396         .addImm(CTW);
    397     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
    398         .addOperand(MI.getOperand(0))
    399         .addOperand(MI.getOperand(1))
    400         .addImm(SrcX)
    401         .addImm(SrcY)
    402         .addImm(SrcZ)
    403         .addImm(SrcW)
    404         .addImm(0)
    405         .addImm(0)
    406         .addImm(0)
    407         .addImm(0)
    408         .addImm(1)
    409         .addImm(2)
    410         .addImm(3)
    411         .addOperand(RID)
    412         .addOperand(SID)
    413         .addImm(CTX)
    414         .addImm(CTY)
    415         .addImm(CTZ)
    416         .addImm(CTW)
    417         .addReg(T0, RegState::Implicit)
    418         .addReg(T1, RegState::Implicit);
    419     break;
    420   }
    421 
    422   case AMDGPU::TXD_SHADOW: {
    423     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    424     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    425     MachineOperand &RID = MI.getOperand(4);
    426     MachineOperand &SID = MI.getOperand(5);
    427     unsigned TextureId = MI.getOperand(6).getImm();
    428     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    429     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
    430 
    431     switch (TextureId) {
    432     case 5: // Rect
    433       CTX = CTY = 0;
    434       break;
    435     case 6: // Shadow1D
    436       SrcW = SrcZ;
    437       break;
    438     case 7: // Shadow2D
    439       SrcW = SrcZ;
    440       break;
    441     case 8: // ShadowRect
    442       CTX = CTY = 0;
    443       SrcW = SrcZ;
    444       break;
    445     case 9: // 1DArray
    446       SrcZ = SrcY;
    447       CTZ = 0;
    448       break;
    449     case 10: // 2DArray
    450       CTZ = 0;
    451       break;
    452     case 11: // Shadow1DArray
    453       SrcZ = SrcY;
    454       CTZ = 0;
    455       break;
    456     case 12: // Shadow2DArray
    457       CTZ = 0;
    458       break;
    459     }
    460 
    461     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
    462             T0)
    463         .addOperand(MI.getOperand(3))
    464         .addImm(SrcX)
    465         .addImm(SrcY)
    466         .addImm(SrcZ)
    467         .addImm(SrcW)
    468         .addImm(0)
    469         .addImm(0)
    470         .addImm(0)
    471         .addImm(0)
    472         .addImm(1)
    473         .addImm(2)
    474         .addImm(3)
    475         .addOperand(RID)
    476         .addOperand(SID)
    477         .addImm(CTX)
    478         .addImm(CTY)
    479         .addImm(CTZ)
    480         .addImm(CTW);
    481     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
    482             T1)
    483         .addOperand(MI.getOperand(2))
    484         .addImm(SrcX)
    485         .addImm(SrcY)
    486         .addImm(SrcZ)
    487         .addImm(SrcW)
    488         .addImm(0)
    489         .addImm(0)
    490         .addImm(0)
    491         .addImm(0)
    492         .addImm(1)
    493         .addImm(2)
    494         .addImm(3)
    495         .addOperand(RID)
    496         .addOperand(SID)
    497         .addImm(CTX)
    498         .addImm(CTY)
    499         .addImm(CTZ)
    500         .addImm(CTW);
    501     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
    502         .addOperand(MI.getOperand(0))
    503         .addOperand(MI.getOperand(1))
    504         .addImm(SrcX)
    505         .addImm(SrcY)
    506         .addImm(SrcZ)
    507         .addImm(SrcW)
    508         .addImm(0)
    509         .addImm(0)
    510         .addImm(0)
    511         .addImm(0)
    512         .addImm(1)
    513         .addImm(2)
    514         .addImm(3)
    515         .addOperand(RID)
    516         .addOperand(SID)
    517         .addImm(CTX)
    518         .addImm(CTY)
    519         .addImm(CTZ)
    520         .addImm(CTW)
    521         .addReg(T0, RegState::Implicit)
    522         .addReg(T1, RegState::Implicit);
    523     break;
    524   }
    525 
    526   case AMDGPU::BRANCH:
    527     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
    528         .addOperand(MI.getOperand(0));
    529     break;
    530 
    531   case AMDGPU::BRANCH_COND_f32: {
    532     MachineInstr *NewMI =
    533         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
    534                 AMDGPU::PREDICATE_BIT)
    535             .addOperand(MI.getOperand(1))
    536             .addImm(OPCODE_IS_NOT_ZERO)
    537             .addImm(0); // Flags
    538     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    539     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
    540         .addOperand(MI.getOperand(0))
    541         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    542     break;
    543   }
    544 
    545   case AMDGPU::BRANCH_COND_i32: {
    546     MachineInstr *NewMI =
    547         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
    548                 AMDGPU::PREDICATE_BIT)
    549             .addOperand(MI.getOperand(1))
    550             .addImm(OPCODE_IS_NOT_ZERO_INT)
    551             .addImm(0); // Flags
    552     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    553     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
    554         .addOperand(MI.getOperand(0))
    555         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    556     break;
    557   }
    558 
    559   case AMDGPU::EG_ExportSwz:
    560   case AMDGPU::R600_ExportSwz: {
     561     // Instruction is left unmodified if it's not the last one of its type.
    562     bool isLastInstructionOfItsType = true;
    563     unsigned InstExportType = MI.getOperand(1).getImm();
    564     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
    565          EndBlock = BB->end(); NextExportInst != EndBlock;
    566          NextExportInst = std::next(NextExportInst)) {
    567       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
    568           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
    569         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
    570             .getImm();
    571         if (CurrentInstExportType == InstExportType) {
    572           isLastInstructionOfItsType = false;
    573           break;
    574         }
    575       }
    576     }
    577     bool EOP = isEOP(I);
    578     if (!EOP && !isLastInstructionOfItsType)
    579       return BB;
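             // 84 and 40 appear to be the CF EXPORT_DONE instruction encodings for
             // Evergreen and R600 respectively (hardware-specific CF_INST values).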
    580     unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    581     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
    582         .addOperand(MI.getOperand(0))
    583         .addOperand(MI.getOperand(1))
    584         .addOperand(MI.getOperand(2))
    585         .addOperand(MI.getOperand(3))
    586         .addOperand(MI.getOperand(4))
    587         .addOperand(MI.getOperand(5))
    588         .addOperand(MI.getOperand(6))
    589         .addImm(CfInst)
    590         .addImm(EOP);
    591     break;
    592   }
    593   case AMDGPU::RETURN: {
    594     // RETURN instructions must have the live-out registers as implicit uses,
    595     // otherwise they appear dead.
    596     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    597     MachineInstrBuilder MIB(*MF, MI);
    598     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
    599       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    600     return BB;
    601   }
    602   }
    603 
    604   MI.eraseFromParent();
    605   return BB;
    606 }
    607 
    608 //===----------------------------------------------------------------------===//
    609 // Custom DAG Lowering Operations
    610 //===----------------------------------------------------------------------===//
    611 
    612 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    613   MachineFunction &MF = DAG.getMachineFunction();
    614   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
    615   switch (Op.getOpcode()) {
    616   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    617   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
    618   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
    619   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
    620   case ISD::SRA_PARTS:
    621   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
    622   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
    623   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
    624   case ISD::FCOS:
    625   case ISD::FSIN: return LowerTrig(Op, DAG);
    626   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
    627   case ISD::STORE: return LowerSTORE(Op, DAG);
    628   case ISD::LOAD: {
    629     SDValue Result = LowerLOAD(Op, DAG);
    630     assert((!Result.getNode() ||
    631             Result.getNode()->getNumValues() == 2) &&
    632            "Load should return a value and a chain");
    633     return Result;
    634   }
    635 
    636   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    637   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
    638   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
    639   case ISD::INTRINSIC_VOID: {
    640     SDValue Chain = Op.getOperand(0);
    641     unsigned IntrinsicID =
    642                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    643     switch (IntrinsicID) {
    644     case AMDGPUIntrinsic::R600_store_swizzle: {
    645       SDLoc DL(Op);
    646       const SDValue Args[8] = {
    647         Chain,
    648         Op.getOperand(2), // Export Value
    649         Op.getOperand(3), // ArrayBase
    650         Op.getOperand(4), // Type
    651         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
    652         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
    653         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
    654         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
    655       };
    656       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
    657     }
    658 
    659     // default for switch(IntrinsicID)
    660     default: break;
    661     }
    662     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    663     break;
    664   }
    665   case ISD::INTRINSIC_WO_CHAIN: {
    666     unsigned IntrinsicID =
    667                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    668     EVT VT = Op.getValueType();
    669     SDLoc DL(Op);
    670     switch(IntrinsicID) {
    671     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    672     case AMDGPUIntrinsic::r600_tex:
    673     case AMDGPUIntrinsic::r600_texc:
    674     case AMDGPUIntrinsic::r600_txl:
    675     case AMDGPUIntrinsic::r600_txlc:
    676     case AMDGPUIntrinsic::r600_txb:
    677     case AMDGPUIntrinsic::r600_txbc:
    678     case AMDGPUIntrinsic::r600_txf:
    679     case AMDGPUIntrinsic::r600_txq:
    680     case AMDGPUIntrinsic::r600_ddx:
    681     case AMDGPUIntrinsic::r600_ddy: {
    682       unsigned TextureOp;
    683       switch (IntrinsicID) {
    684       case AMDGPUIntrinsic::r600_tex:
    685         TextureOp = 0;
    686         break;
    687       case AMDGPUIntrinsic::r600_texc:
    688         TextureOp = 1;
    689         break;
    690       case AMDGPUIntrinsic::r600_txl:
    691         TextureOp = 2;
    692         break;
    693       case AMDGPUIntrinsic::r600_txlc:
    694         TextureOp = 3;
    695         break;
    696       case AMDGPUIntrinsic::r600_txb:
    697         TextureOp = 4;
    698         break;
    699       case AMDGPUIntrinsic::r600_txbc:
    700         TextureOp = 5;
    701         break;
    702       case AMDGPUIntrinsic::r600_txf:
    703         TextureOp = 6;
    704         break;
    705       case AMDGPUIntrinsic::r600_txq:
    706         TextureOp = 7;
    707         break;
    708       case AMDGPUIntrinsic::r600_ddx:
    709         TextureOp = 8;
    710         break;
    711       case AMDGPUIntrinsic::r600_ddy:
    712         TextureOp = 9;
    713         break;
    714       default:
     715         llvm_unreachable("Unknown Texture Operation");
    716       }
    717 
    718       SDValue TexArgs[19] = {
    719         DAG.getConstant(TextureOp, DL, MVT::i32),
    720         Op.getOperand(1),
    721         DAG.getConstant(0, DL, MVT::i32),
    722         DAG.getConstant(1, DL, MVT::i32),
    723         DAG.getConstant(2, DL, MVT::i32),
    724         DAG.getConstant(3, DL, MVT::i32),
    725         Op.getOperand(2),
    726         Op.getOperand(3),
    727         Op.getOperand(4),
    728         DAG.getConstant(0, DL, MVT::i32),
    729         DAG.getConstant(1, DL, MVT::i32),
    730         DAG.getConstant(2, DL, MVT::i32),
    731         DAG.getConstant(3, DL, MVT::i32),
    732         Op.getOperand(5),
    733         Op.getOperand(6),
    734         Op.getOperand(7),
    735         Op.getOperand(8),
    736         Op.getOperand(9),
    737         Op.getOperand(10)
    738       };
    739       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    740     }
    741     case AMDGPUIntrinsic::r600_dot4: {
    742       SDValue Args[8] = {
    743       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    744           DAG.getConstant(0, DL, MVT::i32)),
    745       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    746           DAG.getConstant(0, DL, MVT::i32)),
    747       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    748           DAG.getConstant(1, DL, MVT::i32)),
    749       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    750           DAG.getConstant(1, DL, MVT::i32)),
    751       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    752           DAG.getConstant(2, DL, MVT::i32)),
    753       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    754           DAG.getConstant(2, DL, MVT::i32)),
    755       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
    756           DAG.getConstant(3, DL, MVT::i32)),
    757       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
    758           DAG.getConstant(3, DL, MVT::i32))
    759       };
    760       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    761     }
    762 
    763     case Intrinsic::r600_implicitarg_ptr: {
    764       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
    765       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
    766       return DAG.getConstant(ByteOffset, DL, PtrVT);
    767     }
    768     case Intrinsic::r600_read_ngroups_x:
    769       return LowerImplicitParameter(DAG, VT, DL, 0);
    770     case Intrinsic::r600_read_ngroups_y:
    771       return LowerImplicitParameter(DAG, VT, DL, 1);
    772     case Intrinsic::r600_read_ngroups_z:
    773       return LowerImplicitParameter(DAG, VT, DL, 2);
    774     case Intrinsic::r600_read_global_size_x:
    775       return LowerImplicitParameter(DAG, VT, DL, 3);
    776     case Intrinsic::r600_read_global_size_y:
    777       return LowerImplicitParameter(DAG, VT, DL, 4);
    778     case Intrinsic::r600_read_global_size_z:
    779       return LowerImplicitParameter(DAG, VT, DL, 5);
    780     case Intrinsic::r600_read_local_size_x:
    781       return LowerImplicitParameter(DAG, VT, DL, 6);
    782     case Intrinsic::r600_read_local_size_y:
    783       return LowerImplicitParameter(DAG, VT, DL, 7);
    784     case Intrinsic::r600_read_local_size_z:
    785       return LowerImplicitParameter(DAG, VT, DL, 8);
    786 
    787     case Intrinsic::r600_read_workdim:
    788     case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
    789       uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
    790       return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
    791     }
    792 
    793     case Intrinsic::r600_read_tgid_x:
    794       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    795                                   AMDGPU::T1_X, VT);
    796     case Intrinsic::r600_read_tgid_y:
    797       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    798                                   AMDGPU::T1_Y, VT);
    799     case Intrinsic::r600_read_tgid_z:
    800       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    801                                   AMDGPU::T1_Z, VT);
    802     case Intrinsic::r600_read_tidig_x:
    803       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    804                                   AMDGPU::T0_X, VT);
    805     case Intrinsic::r600_read_tidig_y:
    806       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    807                                   AMDGPU::T0_Y, VT);
    808     case Intrinsic::r600_read_tidig_z:
    809       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    810                                   AMDGPU::T0_Z, VT);
    811 
    812     // FIXME: Should be renamed to r600 prefix
    813     case AMDGPUIntrinsic::AMDGPU_rsq_clamped:
    814       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    815 
    816     case Intrinsic::r600_rsq:
    817     case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
    818       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
    819       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    820     }
    821     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    822     break;
    823   }
    824   } // end switch(Op.getOpcode())
    825   return SDValue();
    826 }
    827 
    828 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
    829                                             SmallVectorImpl<SDValue> &Results,
    830                                             SelectionDAG &DAG) const {
    831   switch (N->getOpcode()) {
    832   default:
    833     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    834     return;
    835   case ISD::FP_TO_UINT:
    836     if (N->getValueType(0) == MVT::i1) {
    837       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    838       return;
    839     }
    840     // Fall-through. Since we don't care about out of bounds values
    841     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    842     // considers some extra cases which are not necessary here.
    843   case ISD::FP_TO_SINT: {
    844     SDValue Result;
    845     if (expandFP_TO_SINT(N, Result, DAG))
    846       Results.push_back(Result);
    847     return;
    848   }
    849   case ISD::SDIVREM: {
    850     SDValue Op = SDValue(N, 1);
    851     SDValue RES = LowerSDIVREM(Op, DAG);
    852     Results.push_back(RES);
    853     Results.push_back(RES.getValue(1));
    854     break;
    855   }
    856   case ISD::UDIVREM: {
    857     SDValue Op = SDValue(N, 0);
    858     LowerUDIVREM64(Op, DAG, Results);
    859     break;
    860   }
    861   }
    862 }
    863 
    864 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
    865                                                    SDValue Vector) const {
    866 
    867   SDLoc DL(Vector);
    868   EVT VecVT = Vector.getValueType();
    869   EVT EltVT = VecVT.getVectorElementType();
    870   SmallVector<SDValue, 8> Args;
    871 
    872   for (unsigned i = 0, e = VecVT.getVectorNumElements();
    873                                                            i != e; ++i) {
    874     Args.push_back(DAG.getNode(
    875         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
    876         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
    877   }
    878 
    879   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
    880 }
    881 
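         // For non-constant element indices, EXTRACT/INSERT_VECTOR_ELT are lowered by
         // first rebuilding the vector as a "vertical" vector (one element per
         // register), which lets the dynamic index be handled with indirect register
         // addressing; constant indices are left to the normal patterns.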
    882 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
    883                                                     SelectionDAG &DAG) const {
    884 
    885   SDLoc DL(Op);
    886   SDValue Vector = Op.getOperand(0);
    887   SDValue Index = Op.getOperand(1);
    888 
    889   if (isa<ConstantSDNode>(Index) ||
    890       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    891     return Op;
    892 
    893   Vector = vectorToVerticalVector(DAG, Vector);
    894   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
    895                      Vector, Index);
    896 }
    897 
    898 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
    899                                                    SelectionDAG &DAG) const {
    900   SDLoc DL(Op);
    901   SDValue Vector = Op.getOperand(0);
    902   SDValue Value = Op.getOperand(1);
    903   SDValue Index = Op.getOperand(2);
    904 
    905   if (isa<ConstantSDNode>(Index) ||
    906       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    907     return Op;
    908 
    909   Vector = vectorToVerticalVector(DAG, Vector);
    910   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
    911                                Vector, Value, Index);
    912   return vectorToVerticalVector(DAG, Insert);
    913 }
    914 
    915 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
    916                                                SDValue Op,
    917                                                SelectionDAG &DAG) const {
    918 
    919   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
    920   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    921     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
    922 
    923   const DataLayout &DL = DAG.getDataLayout();
    924   const GlobalValue *GV = GSD->getGlobal();
    925   MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
    926 
    927   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
    928   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
    929 }
    930 
    931 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
     932   // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
     933   // Thus we lower them to TRIG(FRACT(x / (2 * Pi) + 0.5) - 0.5).
    934   EVT VT = Op.getValueType();
    935   SDValue Arg = Op.getOperand(0);
    936   SDLoc DL(Op);
    937 
    938   // TODO: Should this propagate fast-math-flags?
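           // 0.15915494309 ~= 1 / (2 * Pi): scale the argument from radians to
           // revolutions; FRACT(x + 0.5) - 0.5 then wraps it into [-0.5, 0.5), within
           // the [-1, 1] range required above.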
    939   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
    940       DAG.getNode(ISD::FADD, DL, VT,
    941         DAG.getNode(ISD::FMUL, DL, VT, Arg,
    942           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
    943         DAG.getConstantFP(0.5, DL, MVT::f32)));
    944   unsigned TrigNode;
    945   switch (Op.getOpcode()) {
    946   case ISD::FCOS:
    947     TrigNode = AMDGPUISD::COS_HW;
    948     break;
    949   case ISD::FSIN:
    950     TrigNode = AMDGPUISD::SIN_HW;
    951     break;
    952   default:
    953     llvm_unreachable("Wrong trig opcode");
    954   }
    955   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
    956       DAG.getNode(ISD::FADD, DL, VT, FractPart,
    957         DAG.getConstantFP(-0.5, DL, MVT::f32)));
    958   if (Gen >= R600Subtarget::R700)
    959     return TrigVal;
    960   // On R600 hw, COS/SIN input must be between -Pi and Pi.
    961   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
    962       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
    963 }
    964 
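         // Lower SHL_PARTS (a 64-bit left shift split into two 32-bit halves): compute
         // both the small-shift (amount < 32) and big-shift (amount >= 32) results and
         // select between them with SETULT on the shift amount.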
    965 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
    966   SDLoc DL(Op);
    967   EVT VT = Op.getValueType();
    968 
    969   SDValue Lo = Op.getOperand(0);
    970   SDValue Hi = Op.getOperand(1);
    971   SDValue Shift = Op.getOperand(2);
    972   SDValue Zero = DAG.getConstant(0, DL, VT);
    973   SDValue One  = DAG.getConstant(1, DL, VT);
    974 
    975   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
    976   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
    977   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
    978   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
    979 
     980   // The dance around Width1 is necessary for the 0 special case.
     981   // Without it the CompShift might be 32, producing incorrect results in
     982   // Overflow. So we do the shift in two steps; the alternative is to
     983   // add a conditional to filter out the special case.
    984 
    985   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
    986   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
    987 
    988   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
    989   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
    990   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
    991 
    992   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
    993   SDValue LoBig = Zero;
    994 
    995   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
    996   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
    997 
    998   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
    999 }
   1000 
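         // Mirrors LowerSHLParts for SRL_PARTS / SRA_PARTS. For SRA the big-shift high
         // half is filled with the sign bit (Hi >> (Width - 1)) instead of zero.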
   1001 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
   1002   SDLoc DL(Op);
   1003   EVT VT = Op.getValueType();
   1004 
   1005   SDValue Lo = Op.getOperand(0);
   1006   SDValue Hi = Op.getOperand(1);
   1007   SDValue Shift = Op.getOperand(2);
   1008   SDValue Zero = DAG.getConstant(0, DL, VT);
   1009   SDValue One  = DAG.getConstant(1, DL, VT);
   1010 
   1011   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
   1012 
   1013   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
   1014   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
   1015   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
   1016   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
   1017 
    1018   // The dance around Width1 is necessary for the 0 special case.
    1019   // Without it the CompShift might be 32, producing incorrect results in
    1020   // Overflow. So we do the shift in two steps; the alternative is to
    1021   // add a conditional to filter out the special case.
   1022 
   1023   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
   1024   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
   1025 
   1026   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
   1027   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
   1028   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
   1029 
   1030   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
   1031   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
   1032 
   1033   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
   1034   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
   1035 
   1036   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
   1037 }
   1038 
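         // Lower UADDO / USUBO: \p mainop is the plain ADD/SUB opcode and \p ovf the
         // matching AMDGPUISD CARRY/BORROW opcode; the carry bit is sign-extended from
         // i1 to form the overflow result.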
   1039 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
   1040                                           unsigned mainop, unsigned ovf) const {
   1041   SDLoc DL(Op);
   1042   EVT VT = Op.getValueType();
   1043 
   1044   SDValue Lo = Op.getOperand(0);
   1045   SDValue Hi = Op.getOperand(1);
   1046 
   1047   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
   1048   // Extend sign.
   1049   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
   1050                     DAG.getValueType(MVT::i1));
   1051 
   1052   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
   1053 
   1054   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
   1055 }
   1056 
   1057 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
   1058   SDLoc DL(Op);
   1059   return DAG.getNode(
   1060       ISD::SETCC,
   1061       DL,
   1062       MVT::i1,
   1063       Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
   1064       DAG.getCondCode(ISD::SETNE)
   1065       );
   1066 }
   1067 
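         // Implicit kernel parameters (work-group counts, global/local sizes, etc.)
         // live at fixed dword offsets in constant buffer 0; they are materialized as
         // loads through a null pointer in the CONSTANT_BUFFER_0 address space.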
   1068 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
   1069                                                    const SDLoc &DL,
   1070                                                    unsigned DwordOffset) const {
   1071   unsigned ByteOffset = DwordOffset * 4;
   1072   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
   1073                                       AMDGPUAS::CONSTANT_BUFFER_0);
   1074 
    1075   // We shouldn't be using an offset wider than 16 bits for implicit parameters.
   1076   assert(isInt<16>(ByteOffset));
   1077 
   1078   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
   1079                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
   1080                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
   1081                      false, false, false, 0);
   1082 }
   1083 
   1084 bool R600TargetLowering::isZero(SDValue Op) const {
   1085   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
   1086     return Cst->isNullValue();
   1087   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
   1088     return CstFP->isZero();
   1089   } else {
   1090     return false;
   1091   }
   1092 }
   1093 
   1094 bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
   1095   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
   1096     return CFP->isExactlyValue(1.0);
   1097   }
   1098   return isAllOnesConstant(Op);
   1099 }
   1100 
   1101 bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
   1102   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
   1103     return CFP->getValueAPF().isZero();
   1104   }
   1105   return isNullConstant(Op);
   1106 }
   1107 
   1108 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   1109   SDLoc DL(Op);
   1110   EVT VT = Op.getValueType();
   1111 
   1112   SDValue LHS = Op.getOperand(0);
   1113   SDValue RHS = Op.getOperand(1);
   1114   SDValue True = Op.getOperand(2);
   1115   SDValue False = Op.getOperand(3);
   1116   SDValue CC = Op.getOperand(4);
   1117   SDValue Temp;
   1118 
   1119   if (VT == MVT::f32) {
   1120     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
   1121     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
   1122     if (MinMax)
   1123       return MinMax;
   1124   }
   1125 
   1126   // LHS and RHS are guaranteed to be the same value type
   1127   EVT CompareVT = LHS.getValueType();
   1128 
   1129   // Check if we can lower this to a native operation.
   1130 
   1131   // Try to lower to a SET* instruction:
   1132   //
   1133   // SET* can match the following patterns:
   1134   //
   1135   // select_cc f32, f32, -1,  0, cc_supported
   1136   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
   1137   // select_cc i32, i32, -1,  0, cc_supported
   1138   //
   1139 
   1140   // Move hardware True/False values to the correct operand.
   1141   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1142   ISD::CondCode InverseCC =
   1143      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1144   if (isHWTrueValue(False) && isHWFalseValue(True)) {
   1145     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
   1146       std::swap(False, True);
   1147       CC = DAG.getCondCode(InverseCC);
   1148     } else {
   1149       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
   1150       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
   1151         std::swap(False, True);
   1152         std::swap(LHS, RHS);
   1153         CC = DAG.getCondCode(SwapInvCC);
   1154       }
   1155     }
   1156   }
   1157 
   1158   if (isHWTrueValue(True) && isHWFalseValue(False) &&
   1159       (CompareVT == VT || VT == MVT::i32)) {
   1160     // This can be matched by a SET* instruction.
   1161     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
   1162   }
   1163 
   1164   // Try to lower to a CND* instruction:
   1165   //
   1166   // CND* can match the following patterns:
   1167   //
   1168   // select_cc f32, 0.0, f32, f32, cc_supported
   1169   // select_cc f32, 0.0, i32, i32, cc_supported
   1170   // select_cc i32, 0,   f32, f32, cc_supported
   1171   // select_cc i32, 0,   i32, i32, cc_supported
   1172   //
   1173 
   1174   // Try to move the zero value to the RHS
   1175   if (isZero(LHS)) {
   1176     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1177     // Try swapping the operands
   1178     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
   1179     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1180       std::swap(LHS, RHS);
   1181       CC = DAG.getCondCode(CCSwapped);
   1182     } else {
    1183       // Try inverting the condition and then swapping the operands.
   1184       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
   1185       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
   1186       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
   1187         std::swap(True, False);
   1188         std::swap(LHS, RHS);
   1189         CC = DAG.getCondCode(CCSwapped);
   1190       }
   1191     }
   1192   }
   1193   if (isZero(RHS)) {
   1194     SDValue Cond = LHS;
   1195     SDValue Zero = RHS;
   1196     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1197     if (CompareVT != VT) {
   1198       // Bitcast True / False to the correct types.  This will end up being
   1199       // a nop, but it allows us to define only a single pattern in the
   1200       // .TD files for each CND* instruction rather than having to have
   1201       // one pattern for integer True/False and one for fp True/False
   1202       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
   1203       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
   1204     }
   1205 
   1206     switch (CCOpcode) {
   1207     case ISD::SETONE:
   1208     case ISD::SETUNE:
   1209     case ISD::SETNE:
   1210       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   1211       Temp = True;
   1212       True = False;
   1213       False = Temp;
   1214       break;
   1215     default:
   1216       break;
   1217     }
   1218     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
   1219         Cond, Zero,
   1220         True, False,
   1221         DAG.getCondCode(CCOpcode));
   1222     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
   1223   }
   1224 
    1225   // If we make it this far, it means we have no native instructions to handle
    1226   // this SELECT_CC, so we must lower it.
   1227   SDValue HWTrue, HWFalse;
   1228 
   1229   if (CompareVT == MVT::f32) {
   1230     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
   1231     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
   1232   } else if (CompareVT == MVT::i32) {
   1233     HWTrue = DAG.getConstant(-1, DL, CompareVT);
   1234     HWFalse = DAG.getConstant(0, DL, CompareVT);
   1235   }
   1236   else {
   1237     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
   1238   }
   1239 
   1240   // Lower this unsupported SELECT_CC into a combination of two supported
   1241   // SELECT_CC operations.
   1242   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
   1243 
   1244   return DAG.getNode(ISD::SELECT_CC, DL, VT,
   1245       Cond, HWFalse,
   1246       True, False,
   1247       DAG.getCondCode(ISD::SETNE));
   1248 }
   1249 
   1250 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
   1251 /// convert these pointers to a register index.  Each register holds
    1252 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
    1253 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
   1254 /// for indirect addressing.
   1255 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
   1256                                                unsigned StackWidth,
   1257                                                SelectionDAG &DAG) const {
   1258   unsigned SRLPad;
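           // Each register row covers StackWidth * 4 bytes, so the byte address is
           // converted to a register index by shifting right by log2 of that row size
           // (4, 8 or 16 bytes -> shift by 2, 3 or 4).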
   1259   switch(StackWidth) {
   1260   case 1:
   1261     SRLPad = 2;
   1262     break;
   1263   case 2:
   1264     SRLPad = 3;
   1265     break;
   1266   case 4:
   1267     SRLPad = 4;
   1268     break;
   1269   default: llvm_unreachable("Invalid stack width");
   1270   }
   1271 
   1272   SDLoc DL(Ptr);
   1273   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
   1274                      DAG.getConstant(SRLPad, DL, MVT::i32));
   1275 }
   1276 
   1277 void R600TargetLowering::getStackAddress(unsigned StackWidth,
   1278                                          unsigned ElemIdx,
   1279                                          unsigned &Channel,
   1280                                          unsigned &PtrIncr) const {
   1281   switch (StackWidth) {
   1282   default:
   1283   case 1:
   1284     Channel = 0;
   1285     if (ElemIdx > 0) {
   1286       PtrIncr = 1;
   1287     } else {
   1288       PtrIncr = 0;
   1289     }
   1290     break;
   1291   case 2:
   1292     Channel = ElemIdx % 2;
   1293     if (ElemIdx == 2) {
   1294       PtrIncr = 1;
   1295     } else {
   1296       PtrIncr = 0;
   1297     }
   1298     break;
   1299   case 4:
   1300     Channel = ElemIdx;
   1301     PtrIncr = 0;
   1302     break;
   1303   }
   1304 }
   1305 
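         // Private (scratch) memory is accessed a dword at a time here, so an i8/i16
         // truncating store is emulated as a read-modify-write: load the containing
         // 32-bit word, clear the destination byte/halfword with a shifted mask, OR in
         // the shifted value and store the word back.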
   1306 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
   1307                                                    SelectionDAG &DAG) const {
   1308   SDLoc DL(Store);
   1309 
   1310   unsigned Mask = 0;
   1311   if (Store->getMemoryVT() == MVT::i8) {
   1312     Mask = 0xff;
   1313   } else if (Store->getMemoryVT() == MVT::i16) {
   1314     Mask = 0xffff;
   1315   }
   1316 
   1317   SDValue Chain = Store->getChain();
   1318   SDValue BasePtr = Store->getBasePtr();
   1319   EVT MemVT = Store->getMemoryVT();
   1320 
   1321   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
   1322                             DAG.getConstant(2, DL, MVT::i32));
   1323   SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
   1324                             Chain, Ptr,
   1325                             DAG.getTargetConstant(0, DL, MVT::i32));
   1326 
   1327   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
   1328                                 DAG.getConstant(0x3, DL, MVT::i32));
   1329 
   1330   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
   1331                                  DAG.getConstant(3, DL, MVT::i32));
   1332 
   1333   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
   1334                                   Store->getValue());
   1335 
   1336   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
   1337 
   1338   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
   1339                                      MaskedValue, ShiftAmt);
   1340 
   1341   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
   1342                                 DAG.getConstant(Mask, DL, MVT::i32),
   1343                                 ShiftAmt);
   1344   DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
   1345                         DAG.getConstant(0xffffffff, DL, MVT::i32));
   1346   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
   1347 
   1348   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
   1349   return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
   1350                      Chain, Value, Ptr,
   1351                      DAG.getTargetConstant(0, DL, MVT::i32));
   1352 }
   1353 
   1354 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   1355   if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
   1356     return Result;
   1357 
   1358   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
   1359   unsigned AS = StoreNode->getAddressSpace();
   1360   SDValue Value = StoreNode->getValue();
   1361   EVT ValueVT = Value.getValueType();
   1362 
   1363   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
   1364       ValueVT.isVector()) {
   1365     return SplitVectorStore(Op, DAG);
   1366   }
   1367 
   1368   SDLoc DL(Op);
   1369   SDValue Chain = StoreNode->getChain();
   1370   SDValue Ptr = StoreNode->getBasePtr();
   1371 
   1372   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
   1373     if (StoreNode->isTruncatingStore()) {
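              // A truncating global store is emitted as STORE_MSKOR: the truncated
              // value and a matching byte-lane mask are shifted into place within
              // the containing dword and packed as {value, 0, 0, mask} together
              // with the dword address.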
   1374       EVT VT = Value.getValueType();
   1375       assert(VT.bitsLE(MVT::i32));
   1376       EVT MemVT = StoreNode->getMemoryVT();
   1377       SDValue MaskConstant;
   1378       if (MemVT == MVT::i8) {
   1379         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
   1380       } else {
   1381         assert(MemVT == MVT::i16);
   1382         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
   1383       }
   1384       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
   1385                                       DAG.getConstant(2, DL, MVT::i32));
   1386       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
   1387                                       DAG.getConstant(0x00000003, DL, VT));
   1388       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
   1389       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
   1390                                    DAG.getConstant(3, DL, VT));
   1391       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
   1392       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
   1393       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
   1394       // vector instead.
   1395       SDValue Src[4] = {
   1396         ShiftedValue,
   1397         DAG.getConstant(0, DL, MVT::i32),
   1398         DAG.getConstant(0, DL, MVT::i32),
   1399         Mask
   1400       };
   1401       SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
   1402       SDValue Args[3] = { Chain, Input, DWordAddr };
   1403       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
   1404                                      Op->getVTList(), Args, MemVT,
   1405                                      StoreNode->getMemOperand());
   1406     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
   1407                ValueVT.bitsGE(MVT::i32)) {
   1408       // Convert pointer from byte address to dword address.
   1409       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
   1410                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
   1411                                     Ptr, DAG.getConstant(2, DL, MVT::i32)));
   1412 
   1413       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
   1414         llvm_unreachable("Truncated and indexed stores not supported yet");
   1415       } else {
   1416         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
   1417       }
   1418       return Chain;
   1419     }
   1420   }
   1421 
   1422   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
   1423     return SDValue();
   1424 
   1425   EVT MemVT = StoreNode->getMemoryVT();
   1426   if (MemVT.bitsLT(MVT::i32))
   1427     return lowerPrivateTruncStore(StoreNode, DAG);
   1428 
   1429   // Lowering for indirect addressing
   1430   const MachineFunction &MF = DAG.getMachineFunction();
   1431   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
   1432   unsigned StackWidth = TFL->getStackWidth(MF);
   1433 
   1434   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1435 
   1436   if (ValueVT.isVector()) {
   1437     unsigned NumElemVT = ValueVT.getVectorNumElements();
   1438     EVT ElemVT = ValueVT.getVectorElementType();
   1439     SmallVector<SDValue, 4> Stores(NumElemVT);
   1440 
   1441     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
   1442                                       "vector width in store");
   1443 
   1444     for (unsigned i = 0; i < NumElemVT; ++i) {
   1445       unsigned Channel, PtrIncr;
   1446       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1447       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1448                         DAG.getConstant(PtrIncr, DL, MVT::i32));
   1449       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
   1450                                  Value, DAG.getConstant(i, DL, MVT::i32));
   1451 
   1452       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
   1453                               Chain, Elem, Ptr,
   1454                               DAG.getTargetConstant(Channel, DL, MVT::i32));
   1455     }
   1456     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
   1457   } else {
   1458     if (ValueVT == MVT::i8) {
   1459       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
   1460     }
   1461     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
   1462                         Chain, Value, Ptr,
                                DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
   1463   }
   1464 
   1465   return Chain;
   1466 }
   1467 
   1468 // Return 512 + (kc_bank << 12), or -1 for non-constant-buffer address spaces.
   1469 static int
   1470 ConstantAddressBlock(unsigned AddressSpace) {
   1471   switch (AddressSpace) {
   1472   case AMDGPUAS::CONSTANT_BUFFER_0:
   1473     return 512;
   1474   case AMDGPUAS::CONSTANT_BUFFER_1:
   1475     return 512 + 4096;
   1476   case AMDGPUAS::CONSTANT_BUFFER_2:
   1477     return 512 + 4096 * 2;
   1478   case AMDGPUAS::CONSTANT_BUFFER_3:
   1479     return 512 + 4096 * 3;
   1480   case AMDGPUAS::CONSTANT_BUFFER_4:
   1481     return 512 + 4096 * 4;
   1482   case AMDGPUAS::CONSTANT_BUFFER_5:
   1483     return 512 + 4096 * 5;
   1484   case AMDGPUAS::CONSTANT_BUFFER_6:
   1485     return 512 + 4096 * 6;
   1486   case AMDGPUAS::CONSTANT_BUFFER_7:
   1487     return 512 + 4096 * 7;
   1488   case AMDGPUAS::CONSTANT_BUFFER_8:
   1489     return 512 + 4096 * 8;
   1490   case AMDGPUAS::CONSTANT_BUFFER_9:
   1491     return 512 + 4096 * 9;
   1492   case AMDGPUAS::CONSTANT_BUFFER_10:
   1493     return 512 + 4096 * 10;
   1494   case AMDGPUAS::CONSTANT_BUFFER_11:
   1495     return 512 + 4096 * 11;
   1496   case AMDGPUAS::CONSTANT_BUFFER_12:
   1497     return 512 + 4096 * 12;
   1498   case AMDGPUAS::CONSTANT_BUFFER_13:
   1499     return 512 + 4096 * 13;
   1500   case AMDGPUAS::CONSTANT_BUFFER_14:
   1501     return 512 + 4096 * 14;
   1502   case AMDGPUAS::CONSTANT_BUFFER_15:
   1503     return 512 + 4096 * 15;
   1504   default:
   1505     return -1;
   1506   }
   1507 }
   1508 
   1509 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
   1510                                                 SelectionDAG &DAG) const {
   1511   SDLoc DL(Op);
   1512   LoadSDNode *Load = cast<LoadSDNode>(Op);
   1513   ISD::LoadExtType ExtType = Load->getExtensionType();
   1514   EVT MemVT = Load->getMemoryVT();
   1515 
   1516   // For subtargets below SI, a private-address extending load of less than
   1517   // 32 bits is lowered as a register load followed by a byte/halfword extract.
   1518 
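        // A value at byte address A lives in 32-bit register A >> 2, at bit offset
        // (A & 3) * 8 within that register.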
   1519   // Get Register holding the target.
   1520   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
   1521                             DAG.getConstant(2, DL, MVT::i32));
   1522   // Load the Register.
   1523   SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
   1524                             Load->getChain(),
   1525                             Ptr,
   1526                             DAG.getTargetConstant(0, DL, MVT::i32),
   1527                             Op.getOperand(2));
   1528 
   1529   // Get offset within the register.
   1530   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
   1531                                 Load->getBasePtr(),
   1532                                 DAG.getConstant(0x3, DL, MVT::i32));
   1533 
   1534   // Bit offset of target byte (byteIdx * 8).
   1535   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
   1536                                  DAG.getConstant(3, DL, MVT::i32));
   1537 
   1538   // Shift to the right.
   1539   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
   1540 
   1541   // Eliminate the upper bits by setting them to ...
   1542   EVT MemEltVT = MemVT.getScalarType();
   1543 
   1544   // ... ones.
   1545   if (ExtType == ISD::SEXTLOAD) {
   1546     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
   1547 
   1548     SDValue Ops[] = {
   1549       DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
   1550       Load->getChain()
   1551     };
   1552 
   1553     return DAG.getMergeValues(Ops, DL);
   1554   }
   1555 
   1556   // ... or zeros.
   1557   SDValue Ops[] = {
   1558     DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
   1559     Load->getChain()
   1560   };
   1561 
   1562   return DAG.getMergeValues(Ops, DL);
   1563 }
   1564 
   1565 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   1566   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   1567   unsigned AS = LoadNode->getAddressSpace();
   1568   EVT MemVT = LoadNode->getMemoryVT();
   1569   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
   1570 
   1571   if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
   1572       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
   1573     return lowerPrivateExtLoad(Op, DAG);
   1574   }
   1575 
   1576   SDLoc DL(Op);
   1577   EVT VT = Op.getValueType();
   1578   SDValue Chain = LoadNode->getChain();
   1579   SDValue Ptr = LoadNode->getBasePtr();
   1580 
   1581   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
   1582     SDValue MergedValues[2] = {
   1583       scalarizeVectorLoad(LoadNode, DAG),
   1584       Chain
   1585     };
   1586     return DAG.getMergeValues(MergedValues, DL);
   1587   }
   1588 
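        // ConstantAddressBlock returns 512 + (kc_bank << 12) for the constant-buffer
        // address spaces, or -1 otherwise; such loads can be selected as
        // CONST_ADDRESS (kc bank) reads instead of real memory loads.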
   1589   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
   1590   if (ConstantBlock > -1 &&
   1591       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
   1592        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
   1593     SDValue Result;
   1594     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
   1595         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
   1596         isa<ConstantSDNode>(Ptr)) {
   1597       SDValue Slots[4];
   1598       for (unsigned i = 0; i < 4; i++) {
   1599         // We want the Const position encoded with the following formula:
   1600         //   (((512 + (kc_bank << 12) + const_index) << 2) + chan)
   1601         // const_index is Ptr computed by LLVM using an alignment of 16.
   1602         // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
   1603         // then divide by 4 at the ISel step.
   1604         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
   1605             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
   1606         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
   1607       }
   1608       EVT NewVT = MVT::v4i32;
   1609       unsigned NumElements = 4;
   1610       if (VT.isVector()) {
   1611         NewVT = VT;
   1612         NumElements = VT.getVectorNumElements();
   1613       }
   1614       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
   1615     } else {
   1616       // A non-constant pointer can't be folded; keep it as a v4i32 load.
   1617       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
   1618           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
   1619                       DAG.getConstant(4, DL, MVT::i32)),
   1620                       DAG.getConstant(LoadNode->getAddressSpace() -
   1621                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
   1622           );
   1623     }
   1624 
   1625     if (!VT.isVector()) {
   1626       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
   1627                            DAG.getConstant(0, DL, MVT::i32));
   1628     }
   1629 
   1630     SDValue MergedValues[2] = {
   1631       Result,
   1632       Chain
   1633     };
   1634     return DAG.getMergeValues(MergedValues, DL);
   1635   }
   1636 
   1637   SDValue LoweredLoad;
   1638 
   1639   // For most operations returning SDValue() will result in the node being
   1640   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
   1641   // need to manually expand loads that may be legal in some address spaces and
   1642   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
   1643   // compute shaders, since the data is sign extended when it is uploaded to the
   1644   // buffer. However SEXT loads from other address spaces are not supported, so
   1645   // we need to expand them here.
   1646   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
   1647     EVT MemVT = LoadNode->getMemoryVT();
   1648     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
   1649     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
   1650                                   LoadNode->getPointerInfo(), MemVT,
   1651                                   LoadNode->isVolatile(),
   1652                                   LoadNode->isNonTemporal(),
   1653                                   LoadNode->isInvariant(),
   1654                                   LoadNode->getAlignment());
   1655     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
   1656                               DAG.getValueType(MemVT));
   1657 
   1658     SDValue MergedValues[2] = { Res, Chain };
   1659     return DAG.getMergeValues(MergedValues, DL);
   1660   }
   1661 
   1662   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1663     return SDValue();
   1664   }
   1665 
   1666   // Lowering for indirect addressing
   1667   const MachineFunction &MF = DAG.getMachineFunction();
   1668   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
   1669   unsigned StackWidth = TFL->getStackWidth(MF);
   1670 
   1671   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
   1672 
   1673   if (VT.isVector()) {
   1674     unsigned NumElemVT = VT.getVectorNumElements();
   1675     EVT ElemVT = VT.getVectorElementType();
   1676     SDValue Loads[4];
   1677 
   1678     assert(NumElemVT <= 4);
   1679     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
   1680                                       "vector width in load");
   1681 
   1682     for (unsigned i = 0; i < NumElemVT; ++i) {
   1683       unsigned Channel, PtrIncr;
   1684       getStackAddress(StackWidth, i, Channel, PtrIncr);
   1685       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
   1686                         DAG.getConstant(PtrIncr, DL, MVT::i32));
   1687       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
   1688                              Chain, Ptr,
   1689                              DAG.getTargetConstant(Channel, DL, MVT::i32),
   1690                              Op.getOperand(2));
   1691     }
   1692     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
   1693     LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
   1694   } else {
   1695     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
   1696                               Chain, Ptr,
   1697                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
   1698                               Op.getOperand(2));
   1699   }
   1700 
   1701   SDValue Ops[2] = {
   1702     LoweredLoad,
   1703     Chain
   1704   };
   1705 
   1706   return DAG.getMergeValues(Ops, DL);
   1707 }
   1708 
   1709 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   1710   SDValue Chain = Op.getOperand(0);
   1711   SDValue Cond  = Op.getOperand(1);
   1712   SDValue Jump  = Op.getOperand(2);
   1713 
   1714   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
   1715                      Chain, Jump, Cond);
   1716 }
   1717 
   1718 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
   1719                                             SelectionDAG &DAG) const {
   1720   MachineFunction &MF = DAG.getMachineFunction();
   1721   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
   1722 
   1723   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
   1724 
   1725   unsigned FrameIndex = FIN->getIndex();
   1726   unsigned IgnoredFrameReg;
   1727   unsigned Offset =
   1728     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
   1729   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
   1730                          Op.getValueType());
   1731 }
   1732 
   1733 /// XXX Only kernel functions are supported, so we can assume for now that
   1734 /// every function is a kernel function, but in the future we should use
   1735 /// separate calling conventions for kernel and non-kernel functions.
   1736 SDValue R600TargetLowering::LowerFormalArguments(
   1737     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   1738     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
   1739     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   1740   SmallVector<CCValAssign, 16> ArgLocs;
   1741   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
   1742                  *DAG.getContext());
   1743   MachineFunction &MF = DAG.getMachineFunction();
   1744   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
   1745 
   1746   SmallVector<ISD::InputArg, 8> LocalIns;
   1747 
   1748   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
   1749 
   1750   AnalyzeFormalArguments(CCInfo, LocalIns);
   1751 
   1752   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
   1753     CCValAssign &VA = ArgLocs[i];
   1754     const ISD::InputArg &In = Ins[i];
   1755     EVT VT = In.VT;
   1756     EVT MemVT = VA.getLocVT();
   1757     if (!VT.isVector() && MemVT.isVector()) {
   1758       // Get load source type if scalarized.
   1759       MemVT = MemVT.getVectorElementType();
   1760     }
   1761 
   1762     if (AMDGPU::isShader(CallConv)) {
   1763       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
   1764       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
   1765       InVals.push_back(Register);
   1766       continue;
   1767     }
   1768 
   1769     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
   1770                                           AMDGPUAS::CONSTANT_BUFFER_0);
   1771 
   1772     // i64 isn't a legal type, so the register type used ends up as i32, which
   1773     // isn't expected here. It attempts to create this sextload, but it ends up
   1774     // being invalid. Somehow this seems to work with i64 arguments, but breaks
   1775     // for <1 x i64>.
   1776 
   1777     // The first 36 bytes of the input buffer contain information about
   1778     // thread group and global sizes.
   1779     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
   1780     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
   1781       // FIXME: This should really check the extload type, but the handling of
   1782       // extload vector parameters seems to be broken.
   1783 
   1784       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
   1785       Ext = ISD::SEXTLOAD;
   1786     }
   1787 
   1788     // Compute the offset from the value.
   1789     // XXX - I think PartOffset should give you this, but it seems to give the
   1790     // size of the register which isn't useful.
   1791 
   1792     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
   1793     unsigned PartOffset = VA.getLocMemOffset();
   1794     unsigned Offset = 36 + VA.getLocMemOffset();
   1795 
   1796     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
   1797     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
   1798                               DAG.getConstant(Offset, DL, MVT::i32),
   1799                               DAG.getUNDEF(MVT::i32),
   1800                               PtrInfo,
   1801                               MemVT, false, true, true, 4);
   1802 
   1803     // 4 is the preferred alignment for the CONSTANT memory space.
   1804     InVals.push_back(Arg);
   1805     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
   1806   }
   1807   return Chain;
   1808 }
   1809 
   1810 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
   1811                                            EVT VT) const {
   1812   if (!VT.isVector())
   1813     return MVT::i32;
   1814   return VT.changeVectorElementTypeToInteger();
   1815 }
   1816 
   1817 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   1818                                                         unsigned AddrSpace,
   1819                                                         unsigned Align,
   1820                                                         bool *IsFast) const {
   1821   if (IsFast)
   1822     *IsFast = false;
   1823 
   1824   if (!VT.isSimple() || VT == MVT::Other)
   1825     return false;
   1826 
   1827   if (VT.bitsLT(MVT::i32))
   1828     return false;
   1829 
   1830   // TODO: This is a rough estimate.
   1831   if (IsFast)
   1832     *IsFast = true;
   1833 
   1834   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
   1835 }
   1836 
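        /// Replace BUILD_VECTOR operands that are constant 0.0/1.0 or duplicates of
        /// an earlier operand with undef, recording the swizzle select to use instead
        /// (SEL_0, SEL_1, or the earlier operand's index) in \p RemapSwizzle.
        /// Operands that are already undef are mapped to SEL_MASK_WRITE.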
   1837 static SDValue CompactSwizzlableVector(
   1838   SelectionDAG &DAG, SDValue VectorEntry,
   1839   DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1840   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1841   assert(RemapSwizzle.empty());
   1842   SDValue NewBldVec[4] = {
   1843     VectorEntry.getOperand(0),
   1844     VectorEntry.getOperand(1),
   1845     VectorEntry.getOperand(2),
   1846     VectorEntry.getOperand(3)
   1847   };
   1848 
   1849   for (unsigned i = 0; i < 4; i++) {
   1850     if (NewBldVec[i].isUndef())
   1851       // We mask the write here to teach later passes that the ith element of
   1852       // this vector is undef. Thus we can use it to reduce 128-bit register
   1853       // usage, break false dependencies and additionally make assembly easier
   1854       // to read.
   1854       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
   1855     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
   1856       if (C->isZero()) {
   1857         RemapSwizzle[i] = 4; // SEL_0
   1858         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1859       } else if (C->isExactlyValue(1.0)) {
   1860         RemapSwizzle[i] = 5; // SEL_1
   1861         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1862       }
   1863     }
   1864 
   1865     if (NewBldVec[i].isUndef())
   1866       continue;
   1867     for (unsigned j = 0; j < i; j++) {
   1868       if (NewBldVec[i] == NewBldVec[j]) {
   1869         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
   1870         RemapSwizzle[i] = j;
   1871         break;
   1872       }
   1873     }
   1874   }
   1875 
   1876   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
   1877                             NewBldVec);
   1878 }
   1879 
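        /// Try to move each EXTRACT_VECTOR_ELT operand of the BUILD_VECTOR to the
        /// lane matching its source element index, so the final swizzle stays as
        /// close to the identity as possible; the performed swap is recorded in
        /// \p RemapSwizzle.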
   1880 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
   1881                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1882   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1883   assert(RemapSwizzle.empty());
   1884   SDValue NewBldVec[4] = {
   1885       VectorEntry.getOperand(0),
   1886       VectorEntry.getOperand(1),
   1887       VectorEntry.getOperand(2),
   1888       VectorEntry.getOperand(3)
   1889   };
   1890   bool isUnmovable[4] = { false, false, false, false };
   1891   for (unsigned i = 0; i < 4; i++) {
   1892     RemapSwizzle[i] = i;
   1893     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   1894       unsigned Idx =
   1895           cast<ConstantSDNode>(NewBldVec[i].getOperand(1))->getZExtValue();
   1896       if (i == Idx)
   1897         isUnmovable[Idx] = true;
   1898     }
   1899   }
   1900 
   1901   for (unsigned i = 0; i < 4; i++) {
   1902     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
   1903       unsigned Idx =
   1904           cast<ConstantSDNode>(NewBldVec[i].getOperand(1))->getZExtValue();
   1905       if (isUnmovable[Idx])
   1906         continue;
   1907       // Swap i and Idx
   1908       std::swap(NewBldVec[Idx], NewBldVec[i]);
   1909       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
   1910       break;
   1911     }
   1912   }
   1913 
   1914   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
   1915                             NewBldVec);
   1916 }
   1917 
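        /// Rewrite \p BuildVector through CompactSwizzlableVector and
        /// ReorganizeVector, and update the four swizzle operands in \p Swz to
        /// follow the remapped lanes.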
   1918 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
   1919                                             SelectionDAG &DAG,
   1920                                             const SDLoc &DL) const {
   1921   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   1922   // Old -> New swizzle values
   1923   DenseMap<unsigned, unsigned> SwizzleRemap;
   1924 
   1925   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
   1926   for (unsigned i = 0; i < 4; i++) {
   1927     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1928     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1929       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
   1930   }
   1931 
   1932   SwizzleRemap.clear();
   1933   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
   1934   for (unsigned i = 0; i < 4; i++) {
   1935     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1936     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1937       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
   1938   }
   1939 
   1940   return BuildVector;
   1941 }
   1942 
   1944 //===----------------------------------------------------------------------===//
   1945 // Custom DAG Optimizations
   1946 //===----------------------------------------------------------------------===//
   1947 
   1948 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   1949                                               DAGCombinerInfo &DCI) const {
   1950   SelectionDAG &DAG = DCI.DAG;
   1951 
   1952   switch (N->getOpcode()) {
   1953   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   1954   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
   1955   case ISD::FP_ROUND: {
   1956     SDValue Arg = N->getOperand(0);
   1957     if (Arg.getOpcode() == ISD::UINT_TO_FP &&
   1958         Arg.getValueType() == MVT::f64) {
   1959       return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
   1960                          Arg.getOperand(0));
   1961     }
   1962     break;
        }
   1963 
   1964   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
   1965   // (i32 select_cc f32, f32, -1, 0 cc)
   1966   //
   1967   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
   1968   // this to one of the SET*_DX10 instructions.
   1969   case ISD::FP_TO_SINT: {
   1970     SDValue FNeg = N->getOperand(0);
   1971     if (FNeg.getOpcode() != ISD::FNEG) {
   1972       return SDValue();
   1973     }
   1974     SDValue SelectCC = FNeg.getOperand(0);
   1975     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
   1976         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
   1977         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
   1978         !isHWTrueValue(SelectCC.getOperand(2)) ||
   1979         !isHWFalseValue(SelectCC.getOperand(3))) {
   1980       return SDValue();
   1981     }
   1982 
   1983     SDLoc dl(N);
   1984     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
   1985                        SelectCC.getOperand(0), // LHS
   1986                        SelectCC.getOperand(1), // RHS
   1987                        DAG.getConstant(-1, dl, MVT::i32), // True
   1988                        DAG.getConstant(0, dl, MVT::i32),  // False
   1989                        SelectCC.getOperand(4)); // CC
   1992   }
   1993 
   1994   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
   1995   // => build_vector elt0, ... , NewEltIdx, ... , eltN
   1996   case ISD::INSERT_VECTOR_ELT: {
   1997     SDValue InVec = N->getOperand(0);
   1998     SDValue InVal = N->getOperand(1);
   1999     SDValue EltNo = N->getOperand(2);
   2000     SDLoc dl(N);
   2001 
   2002     // If the inserted element is an UNDEF, just use the input vector.
   2003     if (InVal.isUndef())
   2004       return InVec;
   2005 
   2006     EVT VT = InVec.getValueType();
   2007 
   2008     // If we can't generate a legal BUILD_VECTOR, exit
   2009     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
   2010       return SDValue();
   2011 
   2012     // Check that we know which element is being inserted
   2013     if (!isa<ConstantSDNode>(EltNo))
   2014       return SDValue();
   2015     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   2016 
   2017     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   2018     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   2019     // vector elements.
   2020     SmallVector<SDValue, 8> Ops;
   2021     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   2022       Ops.append(InVec.getNode()->op_begin(),
   2023                  InVec.getNode()->op_end());
   2024     } else if (InVec.isUndef()) {
   2025       unsigned NElts = VT.getVectorNumElements();
   2026       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
   2027     } else {
   2028       return SDValue();
   2029     }
   2030 
   2031     // Insert the element
   2032     if (Elt < Ops.size()) {
   2033       // All the operands of BUILD_VECTOR must have the same type;
   2034       // we enforce that here.
   2035       EVT OpVT = Ops[0].getValueType();
   2036       if (InVal.getValueType() != OpVT)
   2037         InVal = OpVT.bitsGT(InVal.getValueType()) ?
   2038           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
   2039           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
   2040       Ops[Elt] = InVal;
   2041     }
   2042 
   2043     // Return the new vector
   2044     return DAG.getBuildVector(VT, dl, Ops);
   2045   }
   2046 
   2047   // An extract_vector_elt of a build_vector generated by custom lowering
   2048   // also needs to be combined here.
   2049   case ISD::EXTRACT_VECTOR_ELT: {
   2050     SDValue Arg = N->getOperand(0);
   2051     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
   2052       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   2053         unsigned Element = Const->getZExtValue();
   2054         return Arg->getOperand(Element);
   2055       }
   2056     }
   2057     if (Arg.getOpcode() == ISD::BITCAST &&
   2058         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   2059       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   2060         unsigned Element = Const->getZExtValue();
   2061         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
   2062             Arg->getOperand(0).getOperand(Element));
   2063       }
   2064     }
   2065     break;
   2066   }
   2067 
   2068   case ISD::SELECT_CC: {
   2069     // Try common optimizations
   2070     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
   2071       return Ret;
   2072 
   2073     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
   2074     //      selectcc x, y, a, b, inv(cc)
   2075     //
   2076     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
   2077     //      selectcc x, y, a, b, cc
   2078     SDValue LHS = N->getOperand(0);
   2079     if (LHS.getOpcode() != ISD::SELECT_CC) {
   2080       return SDValue();
   2081     }
   2082 
   2083     SDValue RHS = N->getOperand(1);
   2084     SDValue True = N->getOperand(2);
   2085     SDValue False = N->getOperand(3);
   2086     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
   2087 
   2088     if (LHS.getOperand(2).getNode() != True.getNode() ||
   2089         LHS.getOperand(3).getNode() != False.getNode() ||
   2090         RHS.getNode() != False.getNode()) {
   2091       return SDValue();
   2092     }
   2093 
   2094     switch (NCC) {
   2095     default: return SDValue();
   2096     case ISD::SETNE: return LHS;
   2097     case ISD::SETEQ: {
   2098       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
   2099       LHSCC = ISD::getSetCCInverse(LHSCC,
   2100                                   LHS.getOperand(0).getValueType().isInteger());
   2101       if (DCI.isBeforeLegalizeOps() ||
   2102           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
   2103         return DAG.getSelectCC(SDLoc(N),
   2104                                LHS.getOperand(0),
   2105                                LHS.getOperand(1),
   2106                                LHS.getOperand(2),
   2107                                LHS.getOperand(3),
   2108                                LHSCC);
   2109       break;
   2110     }
   2111     }
   2112     return SDValue();
   2113   }
   2114 
   2115   case AMDGPUISD::EXPORT: {
   2116     SDValue Arg = N->getOperand(1);
   2117     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2118       break;
   2119 
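            // Operand 1 is the export value; it is rebuilt below once its swizzle
            // operands (SWZ_X..SWZ_W) have been optimized.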
   2120     SDValue NewArgs[8] = {
   2121       N->getOperand(0), // Chain
   2122       SDValue(),
   2123       N->getOperand(2), // ArrayBase
   2124       N->getOperand(3), // Type
   2125       N->getOperand(4), // SWZ_X
   2126       N->getOperand(5), // SWZ_Y
   2127       N->getOperand(6), // SWZ_Z
   2128       N->getOperand(7) // SWZ_W
   2129     };
   2130     SDLoc DL(N);
   2131     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
   2132     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
   2133   }
   2134   case AMDGPUISD::TEXTURE_FETCH: {
   2135     SDValue Arg = N->getOperand(1);
   2136     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   2137       break;
   2138 
   2139     SDValue NewArgs[19] = {
   2140       N->getOperand(0),
   2141       N->getOperand(1),
   2142       N->getOperand(2),
   2143       N->getOperand(3),
   2144       N->getOperand(4),
   2145       N->getOperand(5),
   2146       N->getOperand(6),
   2147       N->getOperand(7),
   2148       N->getOperand(8),
   2149       N->getOperand(9),
   2150       N->getOperand(10),
   2151       N->getOperand(11),
   2152       N->getOperand(12),
   2153       N->getOperand(13),
   2154       N->getOperand(14),
   2155       N->getOperand(15),
   2156       N->getOperand(16),
   2157       N->getOperand(17),
   2158       N->getOperand(18),
   2159     };
   2160     SDLoc DL(N);
   2161     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
   2162     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
   2163   }
   2164   }
   2165 
   2166   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   2167 }
   2168 
   2169 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
   2170                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
   2171                                      SDValue &Sel, SDValue &Imm,
   2172                                      SelectionDAG &DAG) const {
   2173   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
   2174   if (!Src.isMachineOpcode())
   2175     return false;
   2176 
   2177   switch (Src.getMachineOpcode()) {
   2178   case AMDGPU::FNEG_R600:
   2179     if (!Neg.getNode())
   2180       return false;
   2181     Src = Src.getOperand(0);
   2182     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
   2183     return true;
   2184   case AMDGPU::FABS_R600:
   2185     if (!Abs.getNode())
   2186       return false;
   2187     Src = Src.getOperand(0);
   2188     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
   2189     return true;
   2190   case AMDGPU::CONST_COPY: {
   2191     unsigned Opcode = ParentNode->getMachineOpcode();
   2192     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2193 
   2194     if (!Sel.getNode())
   2195       return false;
   2196 
   2197     SDValue CstOffset = Src.getOperand(0);
   2198     if (ParentNode->getValueType(0).isVector())
   2199       return false;
   2200 
   2201     // Gather the constant values already read by the other sources so that we
            // can check the constant read limitations below.
   2202     int SrcIndices[] = {
   2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
   2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2214     };
   2215     std::vector<unsigned> Consts;
   2216     for (int OtherSrcIdx : SrcIndices) {
   2217       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
   2218       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
   2219         continue;
   2220       if (HasDst) {
   2221         OtherSrcIdx--;
   2222         OtherSelIdx--;
   2223       }
   2224       if (RegisterSDNode *Reg =
   2225           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
   2226         if (Reg->getReg() == AMDGPU::ALU_CONST) {
   2227           ConstantSDNode *Cst
   2228             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
   2229           Consts.push_back(Cst->getZExtValue());
   2230         }
   2231       }
   2232     }
   2233 
   2234     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
   2235     Consts.push_back(Cst->getZExtValue());
   2236     if (!TII->fitsConstReadLimitations(Consts)) {
   2237       return false;
   2238     }
   2239 
   2240     Sel = CstOffset;
   2241     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
   2242     return true;
   2243   }
   2244   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
   2245     // Check if the Imm slot is used. Taken from below.
   2246     if (cast<ConstantSDNode>(Imm)->getZExtValue())
   2247       return false;
   2248     Imm = Src.getOperand(0);
   2249     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
   2250     return true;
   2251   case AMDGPU::MOV_IMM_I32:
   2252   case AMDGPU::MOV_IMM_F32: {
   2253     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
   2254     uint64_t ImmValue = 0;
   2255 
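            // A handful of immediates (0.0, 0.5, 1.0, 0, 1) have dedicated inline
            // registers; anything else must occupy the instruction's single
            // ALU_LITERAL_X slot.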
   2257     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
   2258       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
   2259       float FloatValue = FPC->getValueAPF().convertToFloat();
   2260       if (FloatValue == 0.0) {
   2261         ImmReg = AMDGPU::ZERO;
   2262       } else if (FloatValue == 0.5) {
   2263         ImmReg = AMDGPU::HALF;
   2264       } else if (FloatValue == 1.0) {
   2265         ImmReg = AMDGPU::ONE;
   2266       } else {
   2267         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
   2268       }
   2269     } else {
   2270       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
   2271       uint64_t Value = C->getZExtValue();
   2272       if (Value == 0) {
   2273         ImmReg = AMDGPU::ZERO;
   2274       } else if (Value == 1) {
   2275         ImmReg = AMDGPU::ONE_INT;
   2276       } else {
   2277         ImmValue = Value;
   2278       }
   2279     }
   2280 
   2281     // Check that we aren't already using an immediate.
   2282     // XXX: It's possible for an instruction to have more than one
   2283     // immediate operand, but this is not supported yet.
   2284     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
   2285       if (!Imm.getNode())
   2286         return false;
   2287       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
   2288       assert(C);
   2289       if (C->getZExtValue())
   2290         return false;
   2291       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
   2292     }
   2293     Src = DAG.getRegister(ImmReg, MVT::i32);
   2294     return true;
   2295   }
   2296   default:
   2297     return false;
   2298   }
   2299 }
   2300 
   2301 /// \brief Fold the instructions after selecting them
   2302 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
   2303                                             SelectionDAG &DAG) const {
   2304   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
   2305   if (!Node->isMachineOpcode())
   2306     return Node;
   2307 
   2308   unsigned Opcode = Node->getMachineOpcode();
   2309   SDValue FakeOp;
   2310 
   2311   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
   2312 
   2313   if (Opcode == AMDGPU::DOT_4) {
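            // DOT_4 reads eight scalar sources (src0 and src1 for each of X, Y, Z,
            // W); try to fold a modifier or constant into each of them in turn.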
   2314     int OperandIdx[] = {
   2315       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
   2316       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
   2317       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
   2318       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
   2319       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
   2320       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
   2321       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
   2322       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
   2323     };
   2324     int NegIdx[] = {
   2325       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
   2326       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
   2327       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
   2328       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
   2329       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
   2330       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
   2331       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
   2332       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
   2333     };
   2334     int AbsIdx[] = {
   2335       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
   2336       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
   2337       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
   2338       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
   2339       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
   2340       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
   2341       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
   2342       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
   2343     };
   2344     for (unsigned i = 0; i < 8; i++) {
   2345       if (OperandIdx[i] < 0)
   2346         return Node;
   2347       SDValue &Src = Ops[OperandIdx[i] - 1];
   2348       SDValue &Neg = Ops[NegIdx[i] - 1];
   2349       SDValue &Abs = Ops[AbsIdx[i] - 1];
   2350       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2351       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2352       if (HasDst)
   2353         SelIdx--;
   2354       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2355       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
   2356         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2357     }
   2358   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
   2359     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
   2360       SDValue &Src = Ops[i];
   2361       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
   2362         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2363     }
   2364   } else if (Opcode == AMDGPU::CLAMP_R600) {
   2365     SDValue Src = Node->getOperand(0);
   2366     if (!Src.isMachineOpcode() ||
   2367         !TII->hasInstrModifiers(Src.getMachineOpcode()))
   2368       return Node;
   2369     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
   2370         AMDGPU::OpName::clamp);
   2371     if (ClampIdx < 0)
   2372       return Node;
   2373     SDLoc DL(Node);
   2374     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
   2375     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
   2376     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
   2377                               Node->getVTList(), Ops);
   2378   } else {
   2379     if (!TII->hasInstrModifiers(Opcode))
   2380       return Node;
   2381     int OperandIdx[] = {
   2382       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
   2383       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
   2384       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
   2385     };
   2386     int NegIdx[] = {
   2387       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
   2388       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
   2389       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
   2390     };
   2391     int AbsIdx[] = {
   2392       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
   2393       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
   2394       -1
   2395     };
   2396     for (unsigned i = 0; i < 3; i++) {
   2397       if (OperandIdx[i] < 0)
   2398         return Node;
   2399       SDValue &Src = Ops[OperandIdx[i] - 1];
   2400       SDValue &Neg = Ops[NegIdx[i] - 1];
   2401       SDValue FakeAbs;
   2402       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
   2403       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
   2404       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
   2405       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
   2406       if (HasDst) {
   2407         SelIdx--;
   2408         ImmIdx--;
   2409       }
   2410       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
   2411       SDValue &Imm = Ops[ImmIdx];
   2412       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
   2413         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   2414     }
   2415   }
   2416 
   2417   return Node;
   2418 }
   2419