//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"

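// High dword of the buffer resource descriptor built by SI_ADDR64_RSRC below;
// presumably this sets the descriptor's DATA_FORMAT field (see the SI ISA
// documentation for the exact bit layout).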
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {

  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
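  // Note: the wide i8 vector types above are how 128/256/512-bit resource and
  // sampler descriptors are modeled at the DAG level; the SReg classes used
  // here keep them in scalar registers (an assumption based on this mapping).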

  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);

  addRegisterClass(MVT::v1i32, &AMDGPU::VSrc_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties();

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  setOperationAction(ISD::ADD, MVT::i64, Legal);
  setOperationAction(ISD::ADD, MVT::i32, Legal);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  setTargetDAGCombine(ISD::SELECT_CC);

  setTargetDAGCombine(ISD::SETCC);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                     bool *IsFast) const {
  // XXX: This depends on the address space, and we may also want to revisit
  // the alignment values we specify in the DataLayout.
  return VT.bitsGT(MVT::i32);
}

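/// \brief Load a kernel argument of type \p VT located \p Offset bytes into
/// the argument buffer, whose base pointer arrives in SGPR0/SGPR1.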
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
                                         SDLoc DL, SDValue Chain,
                                         unsigned Offset) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                        AMDGPUAS::CONSTANT_ADDRESS);
  EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
                           MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  return DAG.getLoad(VT, DL, Chain, Ptr,
                     MachinePointerInfo(UndefValue::get(PtrTy)),
                     false, false, false, ArgVT.getSizeInBits() >> 3);
}

SDValue SITargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  uint32_t Skipped = 0;

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped |= 1 << i;
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second, split vertices into their elements
    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  if (Info->ShaderType == ShaderType::COMPUTE) {
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped & (1 << i)) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    EVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      // The first 36 bytes of the input buffer contain information about the
      // thread group and global sizes.
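      // (Those 36 bytes are nine i32 values; see the r600_read_* intrinsic
      // offsets in LowerOperation below: ngroups x/y/z at 0/4/8, global size
      // x/y/z at 12/16/20, local size x/y/z at 24/28/32.)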
      SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
                                   36 + VA.getLocMemOffset());
      InVals.push_back(Arg);
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      for (unsigned j = 0; j != NumElements; ++j)
        Regs.push_back(DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
                                   Regs.data(), Regs.size()));
      continue;
    }

    InVals.push_back(Val);
  }
  return Chain;
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
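    // This pseudo materializes a 128-bit buffer resource descriptor for
    // ADDR64 accesses: the low 64 bits hold the base pointer (operand 1) and
    // the high 64 bits hold {0, RSRC_DATA_FORMAT >> 32}.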
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned SuperReg = MI->getOperand(0).getReg();
    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
            .addOperand(MI->getOperand(1));
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
            .addImm(0);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
            .addImm(RSRC_DATA_FORMAT >> 32);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
            .addReg(SubRegHiLo)
            .addImm(AMDGPU::sub0)
            .addReg(SubRegHiHi)
            .addImm(AMDGPU::sub1);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
            .addReg(SubRegLo)
            .addImm(AMDGPU::sub0_sub1)
            .addReg(SubRegHi)
            .addImm(AMDGPU::sub2_sub3);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::V_SUB_F64: {
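    // There is no V_SUB_F64 instruction; lower a - b to V_ADD_F64 with the
    // NEG modifier applied to the second source (the trailing immediate 2
    // below presumably sets the neg bit for src1).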
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(MI->getOperand(2).getReg())
            .addImm(0)  /* src2 */
            .addImm(0)  /* ABS */
            .addImm(0)  /* CLAMP */
            .addImm(0)  /* OMOD */
            .addImm(2); /* NEG */
    MI->eraseFromParent();
    break;
  }
  }
  return BB;
}

EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

    341 
    342 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    343   MachineFunction &MF = DAG.getMachineFunction();
    344   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    345   switch (Op.getOpcode()) {
    346   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    347   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    348   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
    349   case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
    350   case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
    351   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
    352   case ISD::INTRINSIC_WO_CHAIN: {
    353     unsigned IntrinsicID =
    354                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    355     EVT VT = Op.getValueType();
    356     SDLoc DL(Op);
    // XXX: Hardcoded: we use only two user SGPRs to store the pointer to the
    // kernel parameters.
    unsigned NumUserSGPRs = 2;
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
    case Intrinsic::r600_read_ngroups_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
    case Intrinsic::r600_read_global_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
    case Intrinsic::r600_read_global_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
    case Intrinsic::r600_read_global_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
    case Intrinsic::r600_read_local_size_x:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
    case Intrinsic::r600_read_local_size_y:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
    case Intrinsic::r600_read_local_size_z:
      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR1, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);
    }
  }
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return 0;
}

/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter; it also switches the branch target with BR if the
/// need arises.
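///
/// Sketch of the rewrite (shapes illustrative, not taken from this file):
///   (brcond (setcc (intrinsic_w_chain ...), 1, setne), BB)
/// becomes an INTRINSIC_W_CHAIN/INTRINSIC_VOID node with BB appended as its
/// last operand, and any plain BR user is retargeted accordingly.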
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = 0;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition, everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the list of result value types
  SmallVector<EVT, 4> Res;
  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
    Res.push_back(Intr->getValueType(i));

  // Operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
    Ops.push_back(Intr->getOperand(i));
  Ops.push_back(Target);

  // Build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}

SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

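  // Build the high 32 bits by replicating the sign bit of the low word, then
  // pair (lo, hi) into a 64-bit value.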
  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
                           DAG.getConstant(31, MVT::i32));

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
}

SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  if (VT != MVT::i64) {
    return SDValue();
  }

  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32));
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  switch (N->getOpcode()) {
    default: break;
    case ISD::SELECT_CC: {
      ConstantSDNode *True, *False;
      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
          && True->isAllOnesValue()
          && False->isNullValue()
          && VT == MVT::i1) {
        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
                           N->getOperand(1), N->getOperand(4));
      }
      break;
    }
    case ISD::SETCC: {
      SDValue Arg0 = N->getOperand(0);
      SDValue Arg1 = N->getOperand(1);
      SDValue CC = N->getOperand(2);
      ConstantSDNode * C = NULL;
      ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();

      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
      if (VT == MVT::i1
          && Arg0.getOpcode() == ISD::SIGN_EXTEND
          && Arg0.getOperand(0).getValueType() == MVT::i1
          && (C = dyn_cast<ConstantSDNode>(Arg1))
          && C->isNullValue()
          && CCOp == ISD::SETNE) {
        return SimplifySetCC(VT, Arg0.getOperand(0),
                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
      }
      break;
    }
  }
  return SDValue();
}

/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
  return AMDGPU::VSrc_32RegClassID == RegClass ||
         AMDGPU::VSrc_64RegClassID == RegClass;
}

/// \brief Test if RegClass is one of the SSrc classes
static bool isSSrc(unsigned RegClass) {
  return AMDGPU::SSrc_32RegClassID == RegClass ||
         AMDGPU::SSrc_64RegClassID == RegClass;
}
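
// Note: VSrc operands accept a VGPR, an SGPR or an immediate, while SSrc
// operands accept an SGPR or an immediate; the hardware can read at most one
// scalar value per VALU instruction, which is what the ScalarSlotUsed
// bookkeeping below appears to enforce (an assumption based on how foldImm
// and ensureSRegLimit use it).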

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate, and
/// the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  union {
    int32_t I;
    float F;
  } Imm;

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (Node->getZExtValue() >> 32) {
      return -1;
    }
    Imm.I = Node->getSExtValue();
  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
    Imm.F = Node->getValueAPF().convertToFloat();
  else
    return -1; // It isn't an immediate

  if ((Imm.I >= -16 && Imm.I <= 64) ||
      Imm.F == 0.5f || Imm.F == -0.5f ||
      Imm.F == 1.0f || Imm.F == -1.0f ||
      Imm.F == 2.0f || Imm.F == -2.0f ||
      Imm.F == 4.0f || Imm.F == -4.0f)
    return 0; // It's an inline immediate

  return Imm.I; // It's a literal immediate
}

/// \brief Try to fold an immediate directly into an instruction
bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
                               bool &ScalarSlotUsed) const {

  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
    return false;

  const SDValue &Op = Mov->getOperand(0);
  int32_t Value = analyzeImmediate(Op.getNode());
  if (Value == -1) {
    // Not an immediate at all
    return false;

  } else if (Value == 0) {
    // Inline immediates can always be folded
    Operand = Op;
    return true;

  } else if (Value == Immediate) {
    // The same literal immediate has already been folded
    Operand = Op;
    return true;

  } else if (!ScalarSlotUsed && !Immediate) {
    // Fold this literal immediate
    ScalarSlotUsed = true;
    Immediate = Value;
    Operand = Op;
    return true;
  }

  return false;
}

const TargetRegisterClass *SITargetLowering::getRegClassForNode(
                                   SelectionDAG &DAG, const SDValue &Op) const {
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  if (!Op->isMachineOpcode()) {
    switch(Op->getOpcode()) {
    case ISD::CopyFromReg: {
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        return MRI.getRegClass(Reg);
      }
      return TRI.getPhysRegClass(Reg);
    }
    default:  return NULL;
    }
  }
  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
  if (OpClassID != -1) {
    return TRI.getRegClass(OpClassID);
  }
  switch(Op.getMachineOpcode()) {
  case AMDGPU::COPY_TO_REGCLASS:
    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();

    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
    // class, then the register class for the value could be either a
    // VReg or an SReg.  In order to get a more accurate answer, look at
    // the node the value comes from.
    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
        OpClassID == AMDGPU::VSrc_64RegClassID) {
      return getRegClassForNode(DAG, Op.getOperand(0));
    }
    return TRI.getRegClass(OpClassID);
  case AMDGPU::EXTRACT_SUBREG: {
    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    const TargetRegisterClass *SuperClass =
      getRegClassForNode(DAG, Op.getOperand(0));
    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
  }
  case AMDGPU::REG_SEQUENCE:
    // Operand 0 is the register class id for REG_SEQUENCE instructions.
    return TRI.getRegClass(
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
  default:
    return getRegClassFor(Op.getSimpleValueType());
  }
}

/// \brief Does "Op" fit into register class "RegClass"?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
  if (!RC) {
    return false;
  }
  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
}

/// \brief Make sure that we don't exceed the number of allowed scalars
void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                                       unsigned RegClass,
                                       bool &ScalarSlotUsed) const {

  // First map the operand's register class to a destination class
  if (RegClass == AMDGPU::VSrc_32RegClassID)
    RegClass = AMDGPU::VReg_32RegClassID;
  else if (RegClass == AMDGPU::VSrc_64RegClassID)
    RegClass = AMDGPU::VReg_64RegClassID;
  else
    return;

  // Nothing to do if they fit naturally
  if (fitsRegClass(DAG, Operand, RegClass))
    return;

  // If the scalar slot isn't used yet, use it now
  if (!ScalarSlotUsed) {
    ScalarSlotUsed = true;
    return;
  }

  // This is a conservative approach; it is possible that we can't determine
  // the correct register class and copy too often, but better safe than sorry.
  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                    Operand.getValueType(), Operand, RC);
  Operand = SDValue(Node, 0);
}

/// \returns true if \p Node's operands are different from the SDValue list
/// \p Ops
static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
      return true;
    }
  }
  return false;
}

/// \brief Try to fold the Node's operands into the Node
SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
                                       SelectionDAG &DAG) const {

  // Original encoding (either e32 or e64)
  int Opcode = Node->getMachineOpcode();
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
  const MCInstrDesc *Desc = &TII->get(Opcode);

  unsigned NumDefs = Desc->getNumDefs();
  unsigned NumOps = Desc->getNumOperands();

  // Commuted opcode if available
  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);

  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
  assert(!DescRev || DescRev->getNumOperands() == NumOps);

  // e64 version if available, -1 otherwise
  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);

  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));

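  // A 32-bit (e32) encoding can still absorb one literal constant, so start
  // with Immediate == 0 ("literal slot free"); other encodings start at -1
  // ("no literal allowed").  This reading of the flag is an assumption based
  // on how Immediate is tested below.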
  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
  bool HaveVSrc = false, HaveSSrc = false;

  // First figure out what we already have in this instruction
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass))
      HaveVSrc = true;
    else if (isSSrc(RegClass))
      HaveSSrc = true;
    else
      continue;

    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
    if (Imm != -1 && Imm != 0) {
      // Literal immediate
      Immediate = Imm;
    }
  }

  // If we have neither VSrc nor SSrc, it makes no sense to continue
  if (!HaveVSrc && !HaveSSrc)
    return Node;

  // No scalar allowed when we have both VSrc and SSrc
  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;

  // Second, go over the operands and try to fold them
  std::vector<SDValue> Ops;
  bool Promote2e64 = false;
  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
       i != e && Op < NumOps; ++i, ++Op) {

    const SDValue &Operand = Node->getOperand(i);
    Ops.push_back(Operand);

    // Already folded immediate?
    if (isa<ConstantSDNode>(Operand.getNode()) ||
        isa<ConstantFPSDNode>(Operand.getNode()))
      continue;

    // Is this a VSrc or SSrc operand?
    unsigned RegClass = Desc->OpInfo[Op].RegClass;
    if (isVSrc(RegClass) || isSSrc(RegClass)) {
      // Try to fold the immediates
      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
        // Folding didn't work; make sure we don't hit the SReg limit
        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
      }
      continue;
    }

    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {

      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));

      // Test if it makes sense to swap operands
      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[1], RegClass) &&
           fitsRegClass(DAG, Ops[1], OtherRegClass))) {

        // Swap commutable operands
        SDValue Tmp = Ops[1];
        Ops[1] = Ops[0];
        Ops[0] = Tmp;

        Desc = DescRev;
        DescRev = 0;
        continue;
      }
    }

    if (DescE64 && !Immediate) {

      // Test if it makes sense to switch to e64 encoding
      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
        continue;

      int32_t TmpImm = -1;
      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
          (!fitsRegClass(DAG, Ops[i], RegClass) &&
           fitsRegClass(DAG, Ops[i], OtherRegClass))) {

        // Switch to e64 encoding
        Immediate = -1;
        Promote2e64 = true;
        Desc = DescE64;
        DescE64 = 0;
      }
    }
  }

  if (Promote2e64) {
    // Add the modifier flags while promoting
    for (unsigned i = 0; i < 4; ++i)
      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
  }

  // Add optional chain and glue
  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
    Ops.push_back(Node->getOperand(i));

  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
  // this case a brand new node is always created, even if the operands
  // are the same as before.  So, manually check if anything has been changed.
  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
    return Node;
  }

  // Create a completely new instruction
  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
}

/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
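///
/// MIMG instructions return up to four components, selected by a 4-bit
/// writemask (dmask); when users only extract some components via
/// EXTRACT_SUBREG, shrink the mask so the hardware only writes the lanes
/// that are actually read.  E.g. a sample whose only user reads sub0 ends up
/// with writemask 0x1 and a plain 32-bit copy (see the single-lane case
/// below).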
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Writemask = 0, Lane = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    Writemask |= 1 << Lane;
  }

  // Abort if all components are used
  if (Writemask == 0xf)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
    Ops.push_back(Node->getOperand(i));
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());

  // If we only got one lane, replace it with a copy
  if (Writemask == (1U << Lane)) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  Node = AdjustRegClass(Node, DAG);

  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
    adjustWritemask(Node, DAG);

  return foldOperands(Node, DAG);
}

/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
    return;

  unsigned VReg = MI->getOperand(0).getReg();
  unsigned Writemask = MI->getOperand(1).getImm();
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    BitsSet += Writemask & (1 << i) ? 1 : 0;

  const TargetRegisterClass *RC;
  switch (BitsSet) {
  default: return;
  case 1:  RC = &AMDGPU::VReg_32RegClass; break;
  case 2:  RC = &AMDGPU::VReg_64RegClass; break;
  case 3:  RC = &AMDGPU::VReg_96RegClass; break;
  }

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  MRI.setRegClass(VReg, RC);
}

MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
                                                SelectionDAG &DAG) const {

  SDLoc DL(N);
  unsigned NewOpcode = N->getMachineOpcode();
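
  // The cases below deliberately fall through; each later case only picks a
  // NewOpcode when an earlier case has not already done so (hence the
  // NewOpcode == N->getMachineOpcode() guards).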

  switch (N->getMachineOpcode()) {
  default: return N;
  case AMDGPU::S_LOAD_DWORD_IMM:
    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
    }
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
    }
    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
      return N;
    }
    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
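    // The S_LOAD offset appears to be in dwords while the buffer-load offset
    // below is in bytes, hence the << 2 (an assumption based on this shift).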
    SDValue Ops[] = {
      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
                                 DAG.getConstant(0, MVT::i64)), 0),
      N->getOperand(0),
      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
    };
    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
  }
  }
}

SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}
   1081