//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FADD, MVT::v2f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v2f32, Expand);

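  // FCOS/FSIN are custom-lowered (see LowerTrig below): the hardware trig
  // instructions need a range-reduced argument, and pre-R700 parts need an
  // extra scale as well.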
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::LDS_READ_RET: {
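    // An LDS read returns its result through the OQAP queue register, so
    // rebuild the instruction with OQAP as its def and then copy OQAP into
    // the original destination register.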
    MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                                        TII->get(MI->getOpcode()),
                                        AMDGPU::OQAP);
    for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
      NewMI.addOperand(MI->getOperand(i));
    }
    TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
                                 MI->getOperand(0).getReg(),
                                 AMDGPU::OQAP);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
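    // SrcX..SrcW pick which coordinate channel feeds each texture source
    // slot; CTX..CTW are the per-channel coordinate-type flags. Both are
    // remapped below based on the texture target.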

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // The instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
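    // The CF_INST field encoding differs per generation; 84 and 40 appear to
    // correspond to the "export done" CF instruction on Evergreen and
    // R600-family parts respectively.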
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
          Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT:
    Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace the Chain value inside
    // CustomWidenLowerNode.
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
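  // 0.15915494309 below is 1/(2*Pi): scale the argument so that one full
  // period maps onto the unit interval before taking the fractional part.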
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)),
        DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

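// FP_TO_UINT is only marked Custom for an i1 result (see the constructor), so
// it is enough to compare against 0.0: any nonzero float becomes true.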
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                      AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type.
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1,  0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0,   f32, f32, cc_any
  // select_cc i32, 0,   i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
      SDLoc(Op),
      Op.getValueType(),
      Op.getOperand(0),
      DAG.getConstant(0, MVT::i32),
      Op.getOperand(1),
      Op.getOperand(2),
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers.  For indirect addressing, we need
/// to convert these pointers to a register index.  Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
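  // Each indirect-addressing register covers StackWidth * 4 bytes, so the
  // register index is the byte address shifted right by log2(StackWidth * 4).
  // E.g. with StackWidth == 2, byte offset 24 lands in register 24 >> 3 == 3.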
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
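  // Split a flat element index into a channel within the current register and
  // an increment to the next register. E.g. with StackWidth == 2, elements 0
  // and 1 occupy channels 0 and 1 of one register, and element 2 starts the
  // next register.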
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
                        Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Returns 512 + (kc_bank << 12); -1 if the address space is not a constant
// buffer.
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr, computed by LLVM using an alignment of 16.
        // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
        // then divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
        Result,
        Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer.  This is not the case for ISD::LOAD, so
  // we need to manually expand loads that may be legal in some address spaces
  // and illegal in others.  SEXT loads from CONSTANT_BUFFER_0 are supported
  // for compute shaders, since the data is sign-extended when it is uploaded
  // to the buffer.  However, SEXT loads from other address spaces are not
  // supported, so we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                  LoadNode->getPointerInfo(), MemVT,
                                  LoadNode->isVolatile(),
                                  LoadNode->isNonTemporal(),
                                  LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  AnalyzeFormalArguments(CCInfo, Ins);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = VA.getLocVT();

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                                   AMDGPUAS::CONSTANT_BUFFER_0);

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    SDValue Arg = DAG.getLoad(VT, DL, Chain,
                           DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                           MachinePointerInfo(UndefValue::get(PtrTy)), false,
                           false, false, 4); // 4 is the preferred alignment for
                                             // the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

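// Fold constant elements of a BUILD_VECTOR into the special SEL_0/SEL_1
// swizzle selectors (which source 0.0 and 1.0 directly) and deduplicate
// repeated elements, recording the old index -> new selector mapping in
// RemapSwizzle.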
   1282 static SDValue
   1283 CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
   1284                         DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1285   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1286   assert(RemapSwizzle.empty());
   1287   SDValue NewBldVec[4] = {
   1288       VectorEntry.getOperand(0),
   1289       VectorEntry.getOperand(1),
   1290       VectorEntry.getOperand(2),
   1291       VectorEntry.getOperand(3)
   1292   };
   1293 
   1294   for (unsigned i = 0; i < 4; i++) {
   1295     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
   1296       if (C->isZero()) {
   1297         RemapSwizzle[i] = 4; // SEL_0
   1298         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1299       } else if (C->isExactlyValue(1.0)) {
   1300         RemapSwizzle[i] = 5; // SEL_1
   1301         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
   1302       }
   1303     }
   1304 
   1305     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
   1306       continue;
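    // If this lane duplicates an earlier lane, point its swizzle at the
    // earlier lane and drop the duplicate value.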
   1307     for (unsigned j = 0; j < i; j++) {
   1308       if (NewBldVec[i] == NewBldVec[j]) {
   1309         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
   1310         RemapSwizzle[i] = j;
   1311         break;
   1312       }
   1313     }
   1314   }
   1315 
   1316   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1317       VectorEntry.getValueType(), NewBldVec, 4);
   1318 }
   1319 
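/// Try to move each lane that is an EXTRACT_VECTOR_ELT back into the lane
/// it is extracted from, so the final swizzle can read the source register
/// in place. RemapSwizzle records the resulting lane permutation.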
   1320 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
   1321                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
   1322   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   1323   assert(RemapSwizzle.empty());
   1324   SDValue NewBldVec[4] = {
   1325       VectorEntry.getOperand(0),
   1326       VectorEntry.getOperand(1),
   1327       VectorEntry.getOperand(2),
   1328       VectorEntry.getOperand(3)
   1329   };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++)
    RemapSwizzle[i] = i;

  // First pass: an element extracted from its own lane must keep its lane,
  // so mark that lane unmovable before doing any swapping.
  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  // Second pass: move the first movable extracted element back into the
  // lane it is extracted from. RemapSwizzle is still the identity map at
  // this point, so swapping its two entries records the transposition
  // exactly; stop after one swap to keep the mapping consistent.
  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx.
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }
   1346 
   1347   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
   1348       VectorEntry.getValueType(), NewBldVec, 4);
   1349 }
   1350 
   1351 
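/// Canonicalize \p BuildVector with the two passes above and rewrite the
/// four swizzle-select operands in \p Swz through the lane remapping each
/// pass produces.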
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                            SelectionDAG &DAG) const {
   1354   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   1355   // Old -> New swizzle values
   1356   DenseMap<unsigned, unsigned> SwizzleRemap;
   1357 
   1358   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
   1359   for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1361     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1362       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
   1363   }
   1364 
   1365   SwizzleRemap.clear();
   1366   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
   1367   for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
   1369     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
   1370       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
   1371   }
   1372 
   1373   return BuildVector;
   1374 }
   1375 
   1376 
   1377 //===----------------------------------------------------------------------===//
   1378 // Custom DAG Optimizations
   1379 //===----------------------------------------------------------------------===//
   1380 
   1381 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   1382                                               DAGCombinerInfo &DCI) const {
   1383   SelectionDAG &DAG = DCI.DAG;
   1384 
   1385   switch (N->getOpcode()) {
   1386   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
   1387   case ISD::FP_ROUND: {
   1388       SDValue Arg = N->getOperand(0);
   1389       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
   1390         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
   1391                            Arg.getOperand(0));
   1392       }
   1393       break;
   1394     }
   1395 
   1396   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
   1397   // (i32 select_cc f32, f32, -1, 0 cc)
   1398   //
   1399   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
   1400   // this to one of the SET*_DX10 instructions.
   1401   case ISD::FP_TO_SINT: {
   1402     SDValue FNeg = N->getOperand(0);
   1403     if (FNeg.getOpcode() != ISD::FNEG) {
   1404       return SDValue();
   1405     }
   1406     SDValue SelectCC = FNeg.getOperand(0);
   1407     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
   1408         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
   1409         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
   1410         !isHWTrueValue(SelectCC.getOperand(2)) ||
   1411         !isHWFalseValue(SelectCC.getOperand(3))) {
   1412       return SDValue();
   1413     }
   1414 
    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
   1423   }
   1424 
  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  // => build_vector elt0, ..., NewEltIdx, ..., eltN
   1427   case ISD::INSERT_VECTOR_ELT: {
   1428     SDValue InVec = N->getOperand(0);
   1429     SDValue InVal = N->getOperand(1);
   1430     SDValue EltNo = N->getOperand(2);
   1431     SDLoc dl(N);
   1432 
   1433     // If the inserted element is an UNDEF, just use the input vector.
   1434     if (InVal.getOpcode() == ISD::UNDEF)
   1435       return InVec;
   1436 
   1437     EVT VT = InVec.getValueType();
   1438 
   1439     // If we can't generate a legal BUILD_VECTOR, exit
   1440     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
   1441       return SDValue();
   1442 
   1443     // Check that we know which element is being inserted
   1444     if (!isa<ConstantSDNode>(EltNo))
   1445       return SDValue();
   1446     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   1447 
   1448     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   1449     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   1450     // vector elements.
   1451     SmallVector<SDValue, 8> Ops;
   1452     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
   1453       Ops.append(InVec.getNode()->op_begin(),
   1454                  InVec.getNode()->op_end());
   1455     } else if (InVec.getOpcode() == ISD::UNDEF) {
   1456       unsigned NElts = VT.getVectorNumElements();
   1457       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
   1458     } else {
   1459       return SDValue();
   1460     }
   1461 
   1462     // Insert the element
   1463     if (Elt < Ops.size()) {
   1464       // All the operands of BUILD_VECTOR must have the same type;
   1465       // we enforce that here.
   1466       EVT OpVT = Ops[0].getValueType();
   1467       if (InVal.getValueType() != OpVT)
   1468         InVal = OpVT.bitsGT(InVal.getValueType()) ?
   1469           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
   1470           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
   1471       Ops[Elt] = InVal;
   1472     }
   1473 
   1474     // Return the new vector
   1475     return DAG.getNode(ISD::BUILD_VECTOR, dl,
   1476                        VT, &Ops[0], Ops.size());
   1477   }
   1478 
  // Extract_vec (Build_vector) generated by custom lowering
  // also needs a custom combine here.
   1481   case ISD::EXTRACT_VECTOR_ELT: {
   1482     SDValue Arg = N->getOperand(0);
   1483     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
   1484       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1485         unsigned Element = Const->getZExtValue();
   1486         return Arg->getOperand(Element);
   1487       }
   1488     }
   1489     if (Arg.getOpcode() == ISD::BITCAST &&
   1490         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
   1491       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
   1492         unsigned Element = Const->getZExtValue();
   1493         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
   1494             Arg->getOperand(0).getOperand(Element));
   1495       }
   1496     }
    // No combine matched; break instead of falling through, because the
    // SELECT_CC combine below reads operands this node does not have.
    break;
  }
   1498 
   1499   case ISD::SELECT_CC: {
   1500     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
   1501     //      selectcc x, y, a, b, inv(cc)
   1502     //
   1503     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
   1504     //      selectcc x, y, a, b, cc
   1505     SDValue LHS = N->getOperand(0);
   1506     if (LHS.getOpcode() != ISD::SELECT_CC) {
   1507       return SDValue();
   1508     }
   1509 
   1510     SDValue RHS = N->getOperand(1);
   1511     SDValue True = N->getOperand(2);
   1512     SDValue False = N->getOperand(3);
   1513     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
   1514 
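    // The fold only applies when the outer select reuses the inner select's
    // true/false values and compares the inner result against the shared
    // false value.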
   1515     if (LHS.getOperand(2).getNode() != True.getNode() ||
   1516         LHS.getOperand(3).getNode() != False.getNode() ||
   1517         RHS.getNode() != False.getNode()) {
   1518       return SDValue();
   1519     }
   1520 
   1521     switch (NCC) {
   1522     default: return SDValue();
   1523     case ISD::SETNE: return LHS;
   1524     case ISD::SETEQ: {
   1525       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
   1526       LHSCC = ISD::getSetCCInverse(LHSCC,
   1527                                   LHS.getOperand(0).getValueType().isInteger());
   1528       return DAG.getSelectCC(SDLoc(N),
   1529                              LHS.getOperand(0),
   1530                              LHS.getOperand(1),
   1531                              LHS.getOperand(2),
   1532                              LHS.getOperand(3),
   1533                              LHSCC);
   1534     }
   1535     }
   1536   }
   1537   case AMDGPUISD::EXPORT: {
   1538     SDValue Arg = N->getOperand(1);
   1539     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   1540       break;
   1541 
   1542     SDValue NewArgs[8] = {
   1543       N->getOperand(0), // Chain
   1544       SDValue(),
   1545       N->getOperand(2), // ArrayBase
   1546       N->getOperand(3), // Type
   1547       N->getOperand(4), // SWZ_X
   1548       N->getOperand(5), // SWZ_Y
   1549       N->getOperand(6), // SWZ_Z
   1550       N->getOperand(7) // SWZ_W
   1551     };
   1552     SDLoc DL(N);
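    // The export's four swizzle selects are operands 4..7; rewrite them
    // together with the source vector in operand 1.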
   1553     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
   1554     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
   1555   }
   1556   case AMDGPUISD::TEXTURE_FETCH: {
   1557     SDValue Arg = N->getOperand(1);
   1558     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
   1559       break;
   1560 
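    // The fetch's four source swizzle selects are operands 2..5; rewrite
    // them together with the source vector in operand 1.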
    SDValue NewArgs[19];
    for (unsigned i = 0; i < 19; ++i)
      NewArgs[i] = N->getOperand(i);
   1582     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
   1583     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
   1584         NewArgs, 19);
   1585   }
   1586   }
   1587   return SDValue();
   1588 }
   1589