      1 //===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file
     11 /// \brief SI Implementation of TargetInstrInfo.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 
     16 #include "SIInstrInfo.h"
     17 #include "AMDGPUTargetMachine.h"
     18 #include "SIDefines.h"
     19 #include "SIMachineFunctionInfo.h"
     20 #include "llvm/CodeGen/MachineFrameInfo.h"
     21 #include "llvm/CodeGen/MachineInstrBuilder.h"
     22 #include "llvm/CodeGen/MachineRegisterInfo.h"
     23 #include "llvm/IR/Function.h"
     24 #include "llvm/CodeGen/RegisterScavenging.h"
     25 #include "llvm/MC/MCInstrDesc.h"
     26 #include "llvm/Support/Debug.h"
     27 
     28 using namespace llvm;
     29 
     30 SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
     31     : AMDGPUInstrInfo(st), RI() {}
     32 
     33 //===----------------------------------------------------------------------===//
     34 // TargetInstrInfo callbacks
     35 //===----------------------------------------------------------------------===//
     36 
     37 static unsigned getNumOperandsNoGlue(SDNode *Node) {
     38   unsigned N = Node->getNumOperands();
     39   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
     40     --N;
     41   return N;
     42 }
     43 
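         /// \brief Returns the chain operand of \p Load, which is expected to be the
         /// last non-glue operand of the node.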
     44 static SDValue findChainOperand(SDNode *Load) {
     45   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
     46   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
     47   return LastOp;
     48 }
     49 
     50 /// \brief Returns true if both nodes have the same value for the given
     51 ///        operand \p Op, or if both nodes do not have this operand.
     52 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
     53   unsigned Opc0 = N0->getMachineOpcode();
     54   unsigned Opc1 = N1->getMachineOpcode();
     55 
     56   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
     57   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
     58 
     59   if (Op0Idx == -1 && Op1Idx == -1)
     60     return true;
     61 
     62 
     63   if ((Op0Idx == -1 && Op1Idx != -1) ||
     64       (Op1Idx == -1 && Op0Idx != -1))
     65     return false;
     66 
     67   // getNamedOperandIdx returns the index for the MachineInstr's operands,
     68   // which includes the result as the first operand. We are indexing into the
     69   // MachineSDNode's operands, so we need to skip the result operand to get
     70   // the real index.
     71   --Op0Idx;
     72   --Op1Idx;
     73 
     74   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
     75 }
     76 
     77 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
     78                                           int64_t &Offset0,
     79                                           int64_t &Offset1) const {
     80   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
     81     return false;
     82 
     83   unsigned Opc0 = Load0->getMachineOpcode();
     84   unsigned Opc1 = Load1->getMachineOpcode();
     85 
     86   // Make sure both are actually loads.
     87   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
     88     return false;
     89 
     90   if (isDS(Opc0) && isDS(Opc1)) {
     91 
     92     // FIXME: Handle this case:
     93     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
     94       return false;
     95 
     96     // Check base reg.
     97     if (Load0->getOperand(1) != Load1->getOperand(1))
     98       return false;
     99 
    100     // Check chain.
    101     if (findChainOperand(Load0) != findChainOperand(Load1))
    102       return false;
    103 
    104     // Skip read2 / write2 variants for simplicity.
     105     // TODO: We should report true if the used offsets are adjacent (excluding
     106     // the st64 versions).
    107     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
    108         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
    109       return false;
    110 
    111     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    112     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    113     return true;
    114   }
    115 
    116   if (isSMRD(Opc0) && isSMRD(Opc1)) {
    117     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
    118 
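             // For SMRD loads, SDNode operand 0 is the base (sbase) register and
             // operand 1 is the offset; only constant offsets are handled here.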
    119     // Check base reg.
    120     if (Load0->getOperand(0) != Load1->getOperand(0))
    121       return false;
    122 
    123     const ConstantSDNode *Load0Offset =
    124         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    125     const ConstantSDNode *Load1Offset =
    126         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
    127 
    128     if (!Load0Offset || !Load1Offset)
    129       return false;
    130 
    131     // Check chain.
    132     if (findChainOperand(Load0) != findChainOperand(Load1))
    133       return false;
    134 
    135     Offset0 = Load0Offset->getZExtValue();
    136     Offset1 = Load1Offset->getZExtValue();
    137     return true;
    138   }
    139 
    140   // MUBUF and MTBUF can access the same addresses.
    141   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
    142 
    143     // MUBUF and MTBUF have vaddr at different indices.
    144     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
    145         findChainOperand(Load0) != findChainOperand(Load1) ||
    146         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
    147         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
    148       return false;
    149 
    150     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    151     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    152 
    153     if (OffIdx0 == -1 || OffIdx1 == -1)
    154       return false;
    155 
     156     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
     157     // include the output in the operand list, but SDNodes don't, we need to
     158     // subtract one from the index.
    159     --OffIdx0;
    160     --OffIdx1;
    161 
    162     SDValue Off0 = Load0->getOperand(OffIdx0);
    163     SDValue Off1 = Load1->getOperand(OffIdx1);
    164 
    165     // The offset might be a FrameIndexSDNode.
    166     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
    167       return false;
    168 
    169     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    170     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    171     return true;
    172   }
    173 
    174   return false;
    175 }
    176 
    177 static bool isStride64(unsigned Opc) {
    178   switch (Opc) {
    179   case AMDGPU::DS_READ2ST64_B32:
    180   case AMDGPU::DS_READ2ST64_B64:
    181   case AMDGPU::DS_WRITE2ST64_B32:
    182   case AMDGPU::DS_WRITE2ST64_B64:
    183     return true;
    184   default:
    185     return false;
    186   }
    187 }
    188 
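         // Decompose a simple DS, MUBUF/MTBUF, or SMRD memory access into a base
         // register and an immediate offset. Returns false for addressing modes that
         // are not handled here.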
    189 bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
    190                                        unsigned &BaseReg, unsigned &Offset,
    191                                        const TargetRegisterInfo *TRI) const {
    192   unsigned Opc = LdSt->getOpcode();
    193   if (isDS(Opc)) {
    194     const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
    195                                                       AMDGPU::OpName::offset);
    196     if (OffsetImm) {
    197       // Normal, single offset LDS instruction.
    198       const MachineOperand *AddrReg = getNamedOperand(*LdSt,
    199                                                       AMDGPU::OpName::addr);
    200 
    201       BaseReg = AddrReg->getReg();
    202       Offset = OffsetImm->getImm();
    203       return true;
    204     }
    205 
     206     // The two-offset instructions use offset0 and offset1 instead. We can
     207     // treat these as a load with a single offset if the two offsets are
     208     // consecutive. We will use this for some partially aligned loads.
    209     const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
    210                                                        AMDGPU::OpName::offset0);
    211     const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
    212                                                        AMDGPU::OpName::offset1);
    213 
    214     uint8_t Offset0 = Offset0Imm->getImm();
    215     uint8_t Offset1 = Offset1Imm->getImm();
    216     assert(Offset1 > Offset0);
    217 
    218     if (Offset1 - Offset0 == 1) {
     219       // Each of these offsets is in element-sized units, so we need to
     220       // convert them to bytes using the size of the individual reads.
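               // For example, DS_READ2_B32 writes a 64-bit result, so EltSize is
               // 8 / 2 = 4; with offset0 = 2 the pair is reported as one access at
               // BaseReg + 8.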
    221 
    222       unsigned EltSize;
    223       if (LdSt->mayLoad())
    224         EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
    225       else {
    226         assert(LdSt->mayStore());
    227         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
    228         EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
    229       }
    230 
    231       if (isStride64(Opc))
    232         EltSize *= 64;
    233 
    234       const MachineOperand *AddrReg = getNamedOperand(*LdSt,
    235                                                       AMDGPU::OpName::addr);
    236       BaseReg = AddrReg->getReg();
    237       Offset = EltSize * Offset0;
    238       return true;
    239     }
    240 
    241     return false;
    242   }
    243 
    244   if (isMUBUF(Opc) || isMTBUF(Opc)) {
    245     if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
    246       return false;
    247 
    248     const MachineOperand *AddrReg = getNamedOperand(*LdSt,
    249                                                     AMDGPU::OpName::vaddr);
    250     if (!AddrReg)
    251       return false;
    252 
    253     const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
    254                                                       AMDGPU::OpName::offset);
    255     BaseReg = AddrReg->getReg();
    256     Offset = OffsetImm->getImm();
    257     return true;
    258   }
    259 
    260   if (isSMRD(Opc)) {
    261     const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
    262                                                       AMDGPU::OpName::offset);
    263     if (!OffsetImm)
    264       return false;
    265 
    266     const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
    267                                                      AMDGPU::OpName::sbase);
    268     BaseReg = SBaseReg->getReg();
    269     Offset = OffsetImm->getImm();
    270     return true;
    271   }
    272 
    273   return false;
    274 }
    275 
    276 bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
    277                                      MachineInstr *SecondLdSt,
    278                                      unsigned NumLoads) const {
    279   unsigned Opc0 = FirstLdSt->getOpcode();
    280   unsigned Opc1 = SecondLdSt->getOpcode();
    281 
    282   // TODO: This needs finer tuning
    283   if (NumLoads > 4)
    284     return false;
    285 
    286   if (isDS(Opc0) && isDS(Opc1))
    287     return true;
    288 
    289   if (isSMRD(Opc0) && isSMRD(Opc1))
    290     return true;
    291 
    292   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
    293     return true;
    294 
    295   return false;
    296 }
    297 
    298 void
    299 SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
    300                          MachineBasicBlock::iterator MI, DebugLoc DL,
    301                          unsigned DestReg, unsigned SrcReg,
    302                          bool KillSrc) const {
    303 
    304   // If we are trying to copy to or from SCC, there is a bug somewhere else in
    305   // the backend.  While it may be theoretically possible to do this, it should
    306   // never be necessary.
    307   assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
    308 
    309   static const int16_t Sub0_15[] = {
    310     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    311     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    312     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    313     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
    314   };
    315 
    316   static const int16_t Sub0_7[] = {
    317     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    318     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
    319   };
    320 
    321   static const int16_t Sub0_3[] = {
    322     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
    323   };
    324 
    325   static const int16_t Sub0_2[] = {
    326     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
    327   };
    328 
    329   static const int16_t Sub0_1[] = {
    330     AMDGPU::sub0, AMDGPU::sub1, 0
    331   };
    332 
    333   unsigned Opcode;
    334   const int16_t *SubIndices;
    335 
    336   if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    337     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    338     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
    339             .addReg(SrcReg, getKillRegState(KillSrc));
    340     return;
    341 
    342   } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    343     if (DestReg == AMDGPU::VCC) {
    344       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
    345         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
    346           .addReg(SrcReg, getKillRegState(KillSrc));
    347       } else {
    348         // FIXME: Hack until VReg_1 removed.
    349         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
    350         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
    351           .addImm(0)
    352           .addReg(SrcReg, getKillRegState(KillSrc));
    353       }
    354 
    355       return;
    356     }
    357 
    358     assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    359     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
    360             .addReg(SrcReg, getKillRegState(KillSrc));
    361     return;
    362 
    363   } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    364     assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    365     Opcode = AMDGPU::S_MOV_B32;
    366     SubIndices = Sub0_3;
    367 
    368   } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    369     assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    370     Opcode = AMDGPU::S_MOV_B32;
    371     SubIndices = Sub0_7;
    372 
    373   } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    374     assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    375     Opcode = AMDGPU::S_MOV_B32;
    376     SubIndices = Sub0_15;
    377 
    378   } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    379     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
    380            AMDGPU::SReg_32RegClass.contains(SrcReg));
    381     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
    382             .addReg(SrcReg, getKillRegState(KillSrc));
    383     return;
    384 
    385   } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    386     assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
    387            AMDGPU::SReg_64RegClass.contains(SrcReg));
    388     Opcode = AMDGPU::V_MOV_B32_e32;
    389     SubIndices = Sub0_1;
    390 
    391   } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    392     assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    393     Opcode = AMDGPU::V_MOV_B32_e32;
    394     SubIndices = Sub0_2;
    395 
    396   } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    397     assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
    398            AMDGPU::SReg_128RegClass.contains(SrcReg));
    399     Opcode = AMDGPU::V_MOV_B32_e32;
    400     SubIndices = Sub0_3;
    401 
    402   } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    403     assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
    404            AMDGPU::SReg_256RegClass.contains(SrcReg));
    405     Opcode = AMDGPU::V_MOV_B32_e32;
    406     SubIndices = Sub0_7;
    407 
    408   } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    409     assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
    410            AMDGPU::SReg_512RegClass.contains(SrcReg));
    411     Opcode = AMDGPU::V_MOV_B32_e32;
    412     SubIndices = Sub0_15;
    413 
    414   } else {
    415     llvm_unreachable("Can't copy register!");
    416   }
    417 
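           // Copy one sub-register at a time. Every copy except the last also
           // implicitly defines the full DestReg so the wide register is treated as
           // defined by the whole sequence.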
    418   while (unsigned SubIdx = *SubIndices++) {
    419     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
    420       get(Opcode), RI.getSubReg(DestReg, SubIdx));
    421 
    422     Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
    423 
    424     if (*SubIndices)
    425       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
    426   }
    427 }
    428 
    429 unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
    430   const unsigned Opcode = MI.getOpcode();
    431 
    432   int NewOpc;
    433 
    434   // Try to map original to commuted opcode
    435   NewOpc = AMDGPU::getCommuteRev(Opcode);
    436   // Check if the commuted (REV) opcode exists on the target.
    437   if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
    438     return NewOpc;
    439 
    440   // Try to map commuted to original opcode
    441   NewOpc = AMDGPU::getCommuteOrig(Opcode);
    442   // Check if the original (non-REV) opcode exists on the target.
    443   if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
    444     return NewOpc;
    445 
    446   return Opcode;
    447 }
    448 
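         // Pick a move opcode suitable for the destination register class: scalar
         // moves for SGPR classes, VALU moves for 32-bit and 64-bit VGPR classes,
         // and a generic COPY otherwise.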
    449 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
    450 
    451   if (DstRC->getSize() == 4) {
    452     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
    453   } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
    454     return AMDGPU::S_MOV_B64;
    455   } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    456     return  AMDGPU::V_MOV_B64_PSEUDO;
    457   }
    458   return AMDGPU::COPY;
    459 }
    460 
    461 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
    462                                       MachineBasicBlock::iterator MI,
    463                                       unsigned SrcReg, bool isKill,
    464                                       int FrameIndex,
    465                                       const TargetRegisterClass *RC,
    466                                       const TargetRegisterInfo *TRI) const {
    467   MachineFunction *MF = MBB.getParent();
    468   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    469   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
    470   DebugLoc DL = MBB.findDebugLoc(MI);
    471   int Opcode = -1;
    472 
    473   if (RI.isSGPRClass(RC)) {
    474     // We are only allowed to create one new instruction when spilling
     475     // registers, so we need to use a pseudo instruction for spilling
    476     // SGPRs.
    477     switch (RC->getSize() * 8) {
    478       case 32:  Opcode = AMDGPU::SI_SPILL_S32_SAVE;  break;
    479       case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
    480       case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
    481       case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
    482       case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
    483     }
    484   } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    485     MFI->setHasSpilledVGPRs();
    486 
    487     switch(RC->getSize() * 8) {
    488       case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
    489       case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
    490       case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
    491       case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
    492       case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
    493       case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
    494     }
    495   }
    496 
    497   if (Opcode != -1) {
    498     FrameInfo->setObjectAlignment(FrameIndex, 4);
    499     BuildMI(MBB, MI, DL, get(Opcode))
    500             .addReg(SrcReg)
    501             .addFrameIndex(FrameIndex)
     502             // Placeholder registers; these will be filled in by
     503             // SIPrepareScratchRegs.
    504             .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
    505             .addReg(AMDGPU::SGPR0, RegState::Undef);
    506   } else {
    507     LLVMContext &Ctx = MF->getFunction()->getContext();
    508     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
    509                   " spill register");
    510     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
    511             .addReg(SrcReg);
    512   }
    513 }
    514 
    515 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
    516                                        MachineBasicBlock::iterator MI,
    517                                        unsigned DestReg, int FrameIndex,
    518                                        const TargetRegisterClass *RC,
    519                                        const TargetRegisterInfo *TRI) const {
    520   MachineFunction *MF = MBB.getParent();
    521   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    522   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
    523   DebugLoc DL = MBB.findDebugLoc(MI);
    524   int Opcode = -1;
    525 
    526   if (RI.isSGPRClass(RC)){
    527     switch(RC->getSize() * 8) {
    528       case 32:  Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
    529       case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
    530       case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
    531       case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
    532       case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
    533     }
    534   } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
    535     switch(RC->getSize() * 8) {
    536       case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
    537       case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
    538       case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
    539       case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
    540       case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
    541       case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
    542     }
    543   }
    544 
    545   if (Opcode != -1) {
    546     FrameInfo->setObjectAlignment(FrameIndex, 4);
    547     BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    548             .addFrameIndex(FrameIndex)
     549             // Placeholder registers; these will be filled in by
     550             // SIPrepareScratchRegs.
    551             .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
    552             .addReg(AMDGPU::SGPR0, RegState::Undef);
    553 
    554   } else {
    555     LLVMContext &Ctx = MF->getFunction()->getContext();
    556     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
    557                   " restore register");
    558     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
    559   }
    560 }
    561 
     562 /// \param FrameOffset Offset in bytes of the FrameIndex being spilled
    563 unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
    564                                                MachineBasicBlock::iterator MI,
    565                                                RegScavenger *RS, unsigned TmpReg,
    566                                                unsigned FrameOffset,
    567                                                unsigned Size) const {
    568   MachineFunction *MF = MBB.getParent();
    569   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    570   const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
    571   const SIRegisterInfo *TRI =
    572       static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
    573   DebugLoc DL = MBB.findDebugLoc(MI);
    574   unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
    575   unsigned WavefrontSize = ST.getWavefrontSize();
    576 
    577   unsigned TIDReg = MFI->getTIDReg();
    578   if (!MFI->hasCalculatedTID()) {
    579     MachineBasicBlock &Entry = MBB.getParent()->front();
    580     MachineBasicBlock::iterator Insert = Entry.front();
    581     DebugLoc DL = Insert->getDebugLoc();
    582 
    583     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    584     if (TIDReg == AMDGPU::NoRegister)
    585       return TIDReg;
    586 
    587 
    588     if (MFI->getShaderType() == ShaderType::COMPUTE &&
    589         WorkGroupSize > WavefrontSize) {
    590 
    591       unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
    592       unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
    593       unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
    594       unsigned InputPtrReg =
    595           TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
    596       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
    597         if (!Entry.isLiveIn(Reg))
    598           Entry.addLiveIn(Reg);
    599       }
    600 
    601       RS->enterBasicBlock(&Entry);
    602       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
    603       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
    604       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
    605               .addReg(InputPtrReg)
    606               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
    607       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
    608               .addReg(InputPtrReg)
    609               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
    610 
    611       // NGROUPS.X * NGROUPS.Y
    612       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
    613               .addReg(STmp1)
    614               .addReg(STmp0);
    615       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
    616       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
    617               .addReg(STmp1)
    618               .addReg(TIDIGXReg);
     619       // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
    620       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
    621               .addReg(STmp0)
    622               .addReg(TIDIGYReg)
    623               .addReg(TIDReg);
     624       // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
    625       BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
    626               .addReg(TIDReg)
    627               .addReg(TIDIGZReg);
    628     } else {
    629       // Get the wave id
    630       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
    631               TIDReg)
    632               .addImm(-1)
    633               .addImm(0);
    634 
    635       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
    636               TIDReg)
    637               .addImm(-1)
    638               .addReg(TIDReg);
    639     }
    640 
    641     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
    642             TIDReg)
    643             .addImm(2)
    644             .addReg(TIDReg);
    645     MFI->setTIDReg(TIDReg);
    646   }
    647 
    648   // Add FrameIndex to LDS offset
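           // Spill slots are placed after the declared LDS usage; each frame offset
           // gets a block scaled by the workgroup size, and the per-workitem offset
           // in TIDReg (computed above) is added to pick this workitem's slot.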
    649   unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
    650   BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
    651           .addImm(LDSOffset)
    652           .addReg(TIDReg);
    653 
    654   return TmpReg;
    655 }
    656 
    657 void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
    658                              int Count) const {
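           // A single S_NOP can encode at most eight no-ops (an immediate of N
           // yields N + 1), so split the requested count into chunks of eight.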
    659   while (Count > 0) {
    660     int Arg;
    661     if (Count >= 8)
    662       Arg = 7;
    663     else
    664       Arg = Count - 1;
    665     Count -= 8;
    666     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
    667             .addImm(Arg);
    668   }
    669 }
    670 
    671 bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
    672   MachineBasicBlock &MBB = *MI->getParent();
    673   DebugLoc DL = MBB.findDebugLoc(MI);
    674   switch (MI->getOpcode()) {
    675   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
    676 
    677   case AMDGPU::SI_CONSTDATA_PTR: {
    678     unsigned Reg = MI->getOperand(0).getReg();
    679     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    680     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
    681 
    682     BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
    683 
    684     // Add 32-bit offset from this instruction to the start of the constant data.
    685     BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
    686             .addReg(RegLo)
    687             .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
    688             .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
    689     BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
    690             .addReg(RegHi)
    691             .addImm(0)
    692             .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
    693             .addReg(AMDGPU::SCC, RegState::Implicit);
    694     MI->eraseFromParent();
    695     break;
    696   }
    697   case AMDGPU::SGPR_USE:
    698     // This is just a placeholder for register allocation.
    699     MI->eraseFromParent();
    700     break;
    701 
    702   case AMDGPU::V_MOV_B64_PSEUDO: {
    703     unsigned Dst = MI->getOperand(0).getReg();
    704     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    705     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    706 
    707     const MachineOperand &SrcOp = MI->getOperand(1);
    708     // FIXME: Will this work for 64-bit floating point immediates?
    709     assert(!SrcOp.isFPImm());
    710     if (SrcOp.isImm()) {
    711       APInt Imm(64, SrcOp.getImm());
    712       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    713               .addImm(Imm.getLoBits(32).getZExtValue())
    714               .addReg(Dst, RegState::Implicit);
    715       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    716               .addImm(Imm.getHiBits(32).getZExtValue())
    717               .addReg(Dst, RegState::Implicit);
    718     } else {
    719       assert(SrcOp.isReg());
    720       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
    721               .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
    722               .addReg(Dst, RegState::Implicit);
    723       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
    724               .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
    725               .addReg(Dst, RegState::Implicit);
    726     }
    727     MI->eraseFromParent();
    728     break;
    729   }
    730 
    731   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    732     unsigned Dst = MI->getOperand(0).getReg();
    733     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    734     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    735     unsigned Src0 = MI->getOperand(1).getReg();
    736     unsigned Src1 = MI->getOperand(2).getReg();
    737     const MachineOperand &SrcCond = MI->getOperand(3);
    738 
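             // Lower the 64-bit select into two 32-bit V_CNDMASK_B32 selects on the
             // sub-registers, reusing the same condition operand for both halves.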
    739     BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
    740         .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
    741         .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
    742         .addOperand(SrcCond);
    743     BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
    744         .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
    745         .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
    746         .addOperand(SrcCond);
    747     MI->eraseFromParent();
    748     break;
    749   }
    750   }
    751   return true;
    752 }
    753 
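         // Swap src0 and src1. When src1 is an immediate the generic
         // TargetInstrInfo path cannot be used, so the operands (and their source
         // modifiers) are exchanged manually; the opcode is then switched to its
         // commuted form when one exists.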
    754 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
    755                                               bool NewMI) const {
    756 
    757   if (MI->getNumOperands() < 3)
    758     return nullptr;
    759 
    760   int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
    761                                            AMDGPU::OpName::src0);
    762   assert(Src0Idx != -1 && "Should always have src0 operand");
    763 
    764   MachineOperand &Src0 = MI->getOperand(Src0Idx);
    765   if (!Src0.isReg())
    766     return nullptr;
    767 
    768   int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
    769                                            AMDGPU::OpName::src1);
    770   if (Src1Idx == -1)
    771     return nullptr;
    772 
    773   MachineOperand &Src1 = MI->getOperand(Src1Idx);
    774 
    775   // Make sure it's legal to commute operands for VOP2.
    776   if (isVOP2(MI->getOpcode()) &&
    777       (!isOperandLegal(MI, Src0Idx, &Src1) ||
    778        !isOperandLegal(MI, Src1Idx, &Src0))) {
    779     return nullptr;
    780   }
    781 
    782   if (!Src1.isReg()) {
    783     // Allow commuting instructions with Imm operands.
    784     if (NewMI || !Src1.isImm() ||
    785        (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
    786       return nullptr;
    787     }
    788 
    789     // Be sure to copy the source modifiers to the right place.
    790     if (MachineOperand *Src0Mods
    791           = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
    792       MachineOperand *Src1Mods
    793         = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
    794 
    795       int Src0ModsVal = Src0Mods->getImm();
    796       if (!Src1Mods && Src0ModsVal != 0)
    797         return nullptr;
    798 
    799       // XXX - This assert might be a lie. It might be useful to have a neg
    800       // modifier with 0.0.
    801       int Src1ModsVal = Src1Mods->getImm();
    802       assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");
    803 
    804       Src1Mods->setImm(Src0ModsVal);
    805       Src0Mods->setImm(Src1ModsVal);
    806     }
    807 
    808     unsigned Reg = Src0.getReg();
    809     unsigned SubReg = Src0.getSubReg();
    810     if (Src1.isImm())
    811       Src0.ChangeToImmediate(Src1.getImm());
    812     else
    813       llvm_unreachable("Should only have immediates");
    814 
    815     Src1.ChangeToRegister(Reg, false);
    816     Src1.setSubReg(SubReg);
    817   } else {
    818     MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
    819   }
    820 
    821   if (MI)
    822     MI->setDesc(get(commuteOpcode(*MI)));
    823 
    824   return MI;
    825 }
    826 
     827 // This needs to be implemented because the source modifiers may be inserted
     828 // between the true commutable operands, and the base
     829 // TargetInstrInfo::commuteInstruction relies on this hook to find them.
    830 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
    831                                         unsigned &SrcOpIdx1,
    832                                         unsigned &SrcOpIdx2) const {
    833   const MCInstrDesc &MCID = MI->getDesc();
    834   if (!MCID.isCommutable())
    835     return false;
    836 
    837   unsigned Opc = MI->getOpcode();
    838   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
    839   if (Src0Idx == -1)
    840     return false;
    841 
    842   // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
    843   // immediate.
    844   if (!MI->getOperand(Src0Idx).isReg())
    845     return false;
    846 
    847   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
    848   if (Src1Idx == -1)
    849     return false;
    850 
    851   if (!MI->getOperand(Src1Idx).isReg())
    852     return false;
    853 
    854   // If any source modifiers are set, the generic instruction commuting won't
    855   // understand how to copy the source modifiers.
    856   if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
    857       hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
    858     return false;
    859 
    860   SrcOpIdx1 = Src0Idx;
    861   SrcOpIdx2 = Src1Idx;
    862   return true;
    863 }
    864 
    865 MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
    866                                          MachineBasicBlock::iterator I,
    867                                          unsigned DstReg,
    868                                          unsigned SrcReg) const {
    869   return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
     870                  DstReg).addReg(SrcReg);
    871 }
    872 
    873 bool SIInstrInfo::isMov(unsigned Opcode) const {
    874   switch(Opcode) {
    875   default: return false;
    876   case AMDGPU::S_MOV_B32:
    877   case AMDGPU::S_MOV_B64:
    878   case AMDGPU::V_MOV_B32_e32:
    879   case AMDGPU::V_MOV_B32_e64:
    880     return true;
    881   }
    882 }
    883 
    884 bool
    885 SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
    886   return RC != &AMDGPU::EXECRegRegClass;
    887 }
    888 
    889 static void removeModOperands(MachineInstr &MI) {
    890   unsigned Opc = MI.getOpcode();
    891   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    892                                               AMDGPU::OpName::src0_modifiers);
    893   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    894                                               AMDGPU::OpName::src1_modifiers);
    895   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
    896                                               AMDGPU::OpName::src2_modifiers);
    897 
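           // Remove the highest-indexed operand first so the remaining indices stay
           // valid.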
    898   MI.RemoveOperand(Src2ModIdx);
    899   MI.RemoveOperand(Src1ModIdx);
    900   MI.RemoveOperand(Src0ModIdx);
    901 }
    902 
    903 bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
    904                                 unsigned Reg, MachineRegisterInfo *MRI) const {
    905   if (!MRI->hasOneNonDBGUse(Reg))
    906     return false;
    907 
    908   unsigned Opc = UseMI->getOpcode();
    909   if (Opc == AMDGPU::V_MAD_F32) {
    910     // Don't fold if we are using source modifiers. The new VOP2 instructions
    911     // don't have them.
    912     if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
    913         hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
    914         hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
    915       return false;
    916     }
    917 
    918     MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    919     MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    920     MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
    921 
    922     // Multiplied part is the constant: Use v_madmk_f32
    923     // We should only expect these to be on src0 due to canonicalizations.
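             // The resulting VOP2 form takes a literal constant, which occupies the
             // constant bus, so the remaining sources must be VGPRs.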
    924     if (Src0->isReg() && Src0->getReg() == Reg) {
    925       if (!Src1->isReg() ||
    926           (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
    927         return false;
    928 
    929       if (!Src2->isReg() ||
    930           (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
    931         return false;
    932 
    933       // We need to do some weird looking operand shuffling since the madmk
    934       // operands are out of the normal expected order with the multiplied
    935       // constant as the last operand.
    936       //
    937       // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
    938       // src0 -> src2 K
    939       // src1 -> src0
    940       // src2 -> src1
    941 
    942       const int64_t Imm = DefMI->getOperand(1).getImm();
    943 
    944       // FIXME: This would be a lot easier if we could return a new instruction
    945       // instead of having to modify in place.
    946 
    947       // Remove these first since they are at the end.
    948       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
    949                                                       AMDGPU::OpName::omod));
    950       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
    951                                                       AMDGPU::OpName::clamp));
    952 
    953       unsigned Src1Reg = Src1->getReg();
    954       unsigned Src1SubReg = Src1->getSubReg();
    955       unsigned Src2Reg = Src2->getReg();
    956       unsigned Src2SubReg = Src2->getSubReg();
    957       Src0->setReg(Src1Reg);
    958       Src0->setSubReg(Src1SubReg);
    959       Src1->setReg(Src2Reg);
    960       Src1->setSubReg(Src2SubReg);
    961 
    962       Src2->ChangeToImmediate(Imm);
    963 
    964       removeModOperands(*UseMI);
    965       UseMI->setDesc(get(AMDGPU::V_MADMK_F32));
    966 
    967       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
    968       if (DeleteDef)
    969         DefMI->eraseFromParent();
    970 
    971       return true;
    972     }
    973 
    974     // Added part is the constant: Use v_madak_f32
    975     if (Src2->isReg() && Src2->getReg() == Reg) {
    976       // Not allowed to use constant bus for another operand.
    977       // We can however allow an inline immediate as src0.
    978       if (!Src0->isImm() &&
    979           (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
    980         return false;
    981 
    982       if (!Src1->isReg() ||
    983           (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
    984         return false;
    985 
    986       const int64_t Imm = DefMI->getOperand(1).getImm();
    987 
    988       // FIXME: This would be a lot easier if we could return a new instruction
    989       // instead of having to modify in place.
    990 
    991       // Remove these first since they are at the end.
    992       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
    993                                                       AMDGPU::OpName::omod));
    994       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
    995                                                       AMDGPU::OpName::clamp));
    996 
    997       Src2->ChangeToImmediate(Imm);
    998 
    999       // These come before src2.
   1000       removeModOperands(*UseMI);
   1001       UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
   1002 
   1003       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
   1004       if (DeleteDef)
   1005         DefMI->eraseFromParent();
   1006 
   1007       return true;
   1008     }
   1009   }
   1010 
   1011   return false;
   1012 }
   1013 
   1014 bool
   1015 SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
   1016                                          AliasAnalysis *AA) const {
   1017   switch(MI->getOpcode()) {
   1018   default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
   1019   case AMDGPU::S_MOV_B32:
   1020   case AMDGPU::S_MOV_B64:
   1021   case AMDGPU::V_MOV_B32_e32:
   1022     return MI->getOperand(1).isImm();
   1023   }
   1024 }
   1025 
   1026 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
   1027                                 int WidthB, int OffsetB) {
   1028   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
   1029   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
   1030   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
   1031   return LowOffset + LowWidth <= HighOffset;
   1032 }
   1033 
   1034 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
   1035                                                MachineInstr *MIb) const {
   1036   unsigned BaseReg0, Offset0;
   1037   unsigned BaseReg1, Offset1;
   1038 
   1039   if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
   1040       getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
   1041     assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
   1042            "read2 / write2 not expected here yet");
   1043     unsigned Width0 = (*MIa->memoperands_begin())->getSize();
   1044     unsigned Width1 = (*MIb->memoperands_begin())->getSize();
   1045     if (BaseReg0 == BaseReg1 &&
   1046         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
   1047       return true;
   1048     }
   1049   }
   1050 
   1051   return false;
   1052 }
   1053 
   1054 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
   1055                                                   MachineInstr *MIb,
   1056                                                   AliasAnalysis *AA) const {
   1057   unsigned Opc0 = MIa->getOpcode();
   1058   unsigned Opc1 = MIb->getOpcode();
   1059 
   1060   assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
   1061          "MIa must load from or modify a memory location");
   1062   assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
   1063          "MIb must load from or modify a memory location");
   1064 
   1065   if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
   1066     return false;
   1067 
   1068   // XXX - Can we relax this between address spaces?
   1069   if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
   1070     return false;
   1071 
   1072   // TODO: Should we check the address space from the MachineMemOperand? That
   1073   // would allow us to distinguish objects we know don't alias based on the
    1074   // underlying address space, even if it was lowered to a different one,
   1075   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   1076   // buffer.
   1077   if (isDS(Opc0)) {
   1078     if (isDS(Opc1))
   1079       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   1080 
   1081     return !isFLAT(Opc1);
   1082   }
   1083 
   1084   if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
   1085     if (isMUBUF(Opc1) || isMTBUF(Opc1))
   1086       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   1087 
   1088     return !isFLAT(Opc1) && !isSMRD(Opc1);
   1089   }
   1090 
   1091   if (isSMRD(Opc0)) {
   1092     if (isSMRD(Opc1))
   1093       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   1094 
    1095     return !isFLAT(Opc1) && !isMUBUF(Opc1) && !isMTBUF(Opc1);
   1096   }
   1097 
   1098   if (isFLAT(Opc0)) {
   1099     if (isFLAT(Opc1))
   1100       return checkInstOffsetsDoNotOverlap(MIa, MIb);
   1101 
   1102     return false;
   1103   }
   1104 
   1105   return false;
   1106 }
   1107 
   1108 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   1109   int64_t SVal = Imm.getSExtValue();
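           // Integer inline constants cover -16 through 64 inclusive; anything else
           // must match one of the floating-point inline constants checked below.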
   1110   if (SVal >= -16 && SVal <= 64)
   1111     return true;
   1112 
   1113   if (Imm.getBitWidth() == 64) {
   1114     uint64_t Val = Imm.getZExtValue();
   1115     return (DoubleToBits(0.0) == Val) ||
   1116            (DoubleToBits(1.0) == Val) ||
   1117            (DoubleToBits(-1.0) == Val) ||
   1118            (DoubleToBits(0.5) == Val) ||
   1119            (DoubleToBits(-0.5) == Val) ||
   1120            (DoubleToBits(2.0) == Val) ||
   1121            (DoubleToBits(-2.0) == Val) ||
   1122            (DoubleToBits(4.0) == Val) ||
   1123            (DoubleToBits(-4.0) == Val);
   1124   }
   1125 
   1126   // The actual type of the operand does not seem to matter as long
   1127   // as the bits match one of the inline immediate values.  For example:
   1128   //
   1129   // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
   1130   // so it is a legal inline immediate.
   1131   //
   1132   // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
   1133   // floating-point, so it is a legal inline immediate.
   1134   uint32_t Val = Imm.getZExtValue();
   1135 
   1136   return (FloatToBits(0.0f) == Val) ||
   1137          (FloatToBits(1.0f) == Val) ||
   1138          (FloatToBits(-1.0f) == Val) ||
   1139          (FloatToBits(0.5f) == Val) ||
   1140          (FloatToBits(-0.5f) == Val) ||
   1141          (FloatToBits(2.0f) == Val) ||
   1142          (FloatToBits(-2.0f) == Val) ||
   1143          (FloatToBits(4.0f) == Val) ||
   1144          (FloatToBits(-4.0f) == Val);
   1145 }
   1146 
   1147 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   1148                                    unsigned OpSize) const {
   1149   if (MO.isImm()) {
   1150     // MachineOperand provides no way to tell the true operand size, since it
   1151     // only records a 64-bit value. We need to know the size to determine if a
   1152     // 32-bit floating point immediate bit pattern is legal for an integer
   1153     // immediate. It would be for any 32-bit integer operand, but would not be
   1154     // for a 64-bit one.
   1155 
   1156     unsigned BitSize = 8 * OpSize;
   1157     return isInlineConstant(APInt(BitSize, MO.getImm(), true));
   1158   }
   1159 
   1160   return false;
   1161 }
   1162 
   1163 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
   1164                                     unsigned OpSize) const {
   1165   return MO.isImm() && !isInlineConstant(MO, OpSize);
   1166 }
   1167 
   1168 static bool compareMachineOp(const MachineOperand &Op0,
   1169                              const MachineOperand &Op1) {
   1170   if (Op0.getType() != Op1.getType())
   1171     return false;
   1172 
   1173   switch (Op0.getType()) {
   1174   case MachineOperand::MO_Register:
   1175     return Op0.getReg() == Op1.getReg();
   1176   case MachineOperand::MO_Immediate:
   1177     return Op0.getImm() == Op1.getImm();
   1178   default:
   1179     llvm_unreachable("Didn't expect to be comparing these operand types");
   1180   }
   1181 }
   1182 
   1183 bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
   1184                                  const MachineOperand &MO) const {
   1185   const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
   1186 
   1187   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
   1188 
   1189   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
   1190     return true;
   1191 
   1192   if (OpInfo.RegClass < 0)
   1193     return false;
   1194 
   1195   unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
   1196   if (isLiteralConstant(MO, OpSize))
   1197     return RI.opCanUseLiteralConstant(OpInfo.OperandType);
   1198 
   1199   return RI.opCanUseInlineConstant(OpInfo.OperandType);
   1200 }
   1201 
   1202 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
   1203   int Op32 = AMDGPU::getVOPe32(Opcode);
   1204   if (Op32 == -1)
   1205     return false;
   1206 
   1207   return pseudoToMCOpcode(Op32) != -1;
   1208 }
   1209 
   1210 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
   1211   // The src0_modifier operand is present on all instructions
   1212   // that have modifiers.
   1213 
   1214   return AMDGPU::getNamedOperandIdx(Opcode,
   1215                                     AMDGPU::OpName::src0_modifiers) != -1;
   1216 }
   1217 
   1218 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
   1219                                   unsigned OpName) const {
   1220   const MachineOperand *Mods = getNamedOperand(MI, OpName);
   1221   return Mods && Mods->getImm();
   1222 }
   1223 
   1224 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
   1225                                   const MachineOperand &MO,
   1226                                   unsigned OpSize) const {
   1227   // Literal constants use the constant bus.
   1228   if (isLiteralConstant(MO, OpSize))
   1229     return true;
   1230 
   1231   if (!MO.isReg() || !MO.isUse())
   1232     return false;
   1233 
   1234   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
   1235     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
   1236 
   1237   // FLAT_SCR is just an SGPR pair.
   1238   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
   1239     return true;
   1240 
   1241   // EXEC register uses the constant bus.
   1242   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
   1243     return true;
   1244 
   1245   // SGPRs use the constant bus
   1246   if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
   1247       (!MO.isImplicit() &&
   1248       (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
   1249        AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
   1250     return true;
   1251   }
   1252 
   1253   return false;
   1254 }
   1255 
   1256 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
   1257                                     StringRef &ErrInfo) const {
   1258   uint16_t Opcode = MI->getOpcode();
   1259   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   1260   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   1261   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
   1262   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
   1263 
   1264   // Make sure the number of operands is correct.
   1265   const MCInstrDesc &Desc = get(Opcode);
   1266   if (!Desc.isVariadic() &&
   1267       Desc.getNumOperands() != MI->getNumExplicitOperands()) {
   1268      ErrInfo = "Instruction has wrong number of operands.";
   1269      return false;
   1270   }
   1271 
   1272   // Make sure the register classes are correct
   1273   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
   1274     if (MI->getOperand(i).isFPImm()) {
   1275       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
   1276                 "all fp values to integers.";
   1277       return false;
   1278     }
   1279 
   1280     int RegClass = Desc.OpInfo[i].RegClass;
   1281 
   1282     switch (Desc.OpInfo[i].OperandType) {
   1283     case MCOI::OPERAND_REGISTER:
   1284       if (MI->getOperand(i).isImm()) {
   1285         ErrInfo = "Illegal immediate value for operand.";
   1286         return false;
   1287       }
   1288       break;
   1289     case AMDGPU::OPERAND_REG_IMM32:
   1290       break;
   1291     case AMDGPU::OPERAND_REG_INLINE_C:
   1292       if (isLiteralConstant(MI->getOperand(i),
   1293                             RI.getRegClass(RegClass)->getSize())) {
   1294         ErrInfo = "Illegal immediate value for operand.";
   1295         return false;
   1296       }
   1297       break;
   1298     case MCOI::OPERAND_IMMEDIATE:
   1299       // Check if this operand is an immediate.
   1300       // FrameIndex operands will be replaced by immediates, so they are
   1301       // allowed.
   1302       if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
   1303         ErrInfo = "Expected immediate, but got non-immediate";
   1304         return false;
   1305       }
   1306       // Fall-through
   1307     default:
   1308       continue;
   1309     }
   1310 
   1311     if (!MI->getOperand(i).isReg())
   1312       continue;
   1313 
   1314     if (RegClass != -1) {
   1315       unsigned Reg = MI->getOperand(i).getReg();
   1316       if (TargetRegisterInfo::isVirtualRegister(Reg))
   1317         continue;
   1318 
   1319       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
   1320       if (!RC->contains(Reg)) {
   1321         ErrInfo = "Operand has incorrect register class.";
   1322         return false;
   1323       }
   1324     }
   1325   }
   1326 
   1327 
   1328   // Verify VOP*
   1329   if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
   1330     // Only look at the true operands. Only a real operand can use the constant
   1331     // bus, and we don't want to check pseudo-operands like the source modifier
   1332     // flags.
   1333     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
   1334 
   1335     unsigned ConstantBusCount = 0;
   1336     unsigned SGPRUsed = AMDGPU::NoRegister;
   1337     for (int OpIdx : OpIndices) {
   1338       if (OpIdx == -1)
   1339         break;
   1340       const MachineOperand &MO = MI->getOperand(OpIdx);
   1341       if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
   1342         if (MO.isReg()) {
   1343           if (MO.getReg() != SGPRUsed)
   1344             ++ConstantBusCount;
   1345           SGPRUsed = MO.getReg();
   1346         } else {
   1347           ++ConstantBusCount;
   1348         }
   1349       }
   1350     }
   1351     if (ConstantBusCount > 1) {
   1352       ErrInfo = "VOP* instruction uses the constant bus more than once";
   1353       return false;
   1354     }
   1355   }
   1356 
   1357   // Verify misc. restrictions on specific instructions.
   1358   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
   1359       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
   1360     const MachineOperand &Src0 = MI->getOperand(Src0Idx);
   1361     const MachineOperand &Src1 = MI->getOperand(Src1Idx);
   1362     const MachineOperand &Src2 = MI->getOperand(Src2Idx);
   1363     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
   1364       if (!compareMachineOp(Src0, Src1) &&
   1365           !compareMachineOp(Src0, Src2)) {
   1366         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
   1367         return false;
   1368       }
   1369     }
   1370   }
   1371 
   1372   return true;
   1373 }
   1374 
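         // Maps a scalar (SALU/SMRD) opcode to the VALU or MUBUF opcode used when
         // the instruction must be moved off the scalar unit. Pseudo-ops such as
         // COPY and PHI map to themselves; INSTRUCTION_LIST_END means there is no
         // direct replacement.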
   1375 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   1376   switch (MI.getOpcode()) {
   1377   default: return AMDGPU::INSTRUCTION_LIST_END;
   1378   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
   1379   case AMDGPU::COPY: return AMDGPU::COPY;
   1380   case AMDGPU::PHI: return AMDGPU::PHI;
   1381   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   1382   case AMDGPU::S_MOV_B32:
   1383     return MI.getOperand(1).isReg() ?
   1384            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
   1385   case AMDGPU::S_ADD_I32:
   1386   case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
   1387   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
   1388   case AMDGPU::S_SUB_I32:
   1389   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
   1390   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
   1391   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
   1392   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
   1393   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
   1394   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
   1395   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
   1396   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
   1397   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
   1398   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
   1399   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
   1400   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
   1401   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
   1402   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
   1403   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
   1404   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
   1405   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
   1406   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
   1407   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
   1408   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
   1409   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
   1410   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
   1411   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
   1412   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
   1413   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
   1414   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
   1415   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
   1416   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
   1417   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
   1418   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
   1419   case AMDGPU::S_LOAD_DWORD_IMM:
   1420   case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
   1421   case AMDGPU::S_LOAD_DWORDX2_IMM:
   1422   case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
   1423   case AMDGPU::S_LOAD_DWORDX4_IMM:
   1424   case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
   1425   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   1426   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   1427   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
   1428   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
   1429   }
   1430 }
   1431 
   1432 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
   1433   return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
   1434 }
   1435 
   1436 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
   1437                                                       unsigned OpNo) const {
   1438   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   1439   const MCInstrDesc &Desc = get(MI.getOpcode());
   1440   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
   1441       Desc.OpInfo[OpNo].RegClass == -1) {
   1442     unsigned Reg = MI.getOperand(OpNo).getReg();
   1443 
   1444     if (TargetRegisterInfo::isVirtualRegister(Reg))
   1445       return MRI.getRegClass(Reg);
   1446     return RI.getPhysRegClass(Reg);
   1447   }
   1448 
   1449   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
   1450   return RI.getRegClass(RCID);
   1451 }
   1452 
   1453 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
   1454   switch (MI.getOpcode()) {
   1455   case AMDGPU::COPY:
   1456   case AMDGPU::REG_SEQUENCE:
   1457   case AMDGPU::PHI:
   1458   case AMDGPU::INSERT_SUBREG:
   1459     return RI.hasVGPRs(getOpRegClass(MI, 0));
   1460   default:
   1461     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
   1462   }
   1463 }
   1464 
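         // Forces the operand at OpIdx into a newly created virtual register
         // (a COPY for register operands, otherwise a 32-bit move) and rewrites
         // the operand to refer to that register.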
   1465 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
   1466   MachineBasicBlock::iterator I = MI;
   1467   MachineBasicBlock *MBB = MI->getParent();
   1468   MachineOperand &MO = MI->getOperand(OpIdx);
   1469   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   1470   unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
   1471   const TargetRegisterClass *RC = RI.getRegClass(RCID);
   1472   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
   1473   if (MO.isReg())
   1474     Opcode = AMDGPU::COPY;
   1475   else if (RI.isSGPRClass(RC))
   1476     Opcode = AMDGPU::S_MOV_B32;
   1477 
   1478 
   1479   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
   1480   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
   1481     VRC = &AMDGPU::VReg_64RegClass;
   1482   else
   1483     VRC = &AMDGPU::VGPR_32RegClass;
   1484 
   1485   unsigned Reg = MRI.createVirtualRegister(VRC);
   1486   DebugLoc DL = MBB->findDebugLoc(I);
   1487   BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
   1488     .addOperand(MO);
   1489   MO.ChangeToRegister(Reg, false);
   1490 }
   1491 
   1492 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
   1493                                          MachineRegisterInfo &MRI,
   1494                                          MachineOperand &SuperReg,
   1495                                          const TargetRegisterClass *SuperRC,
   1496                                          unsigned SubIdx,
   1497                                          const TargetRegisterClass *SubRC)
   1498                                          const {
   1499   assert(SuperReg.isReg());
   1500 
   1501   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
   1502   unsigned SubReg = MRI.createVirtualRegister(SubRC);
   1503 
   1504   // Just in case the super register is itself a sub-register, copy it to a new
   1505   // value so we don't need to worry about merging its subreg index with the
   1506   // SubIdx passed to this function. The register coalescer should be able to
   1507   // eliminate this extra copy.
   1508   MachineBasicBlock *MBB = MI->getParent();
   1509   DebugLoc DL = MI->getDebugLoc();
   1510 
   1511   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
   1512     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
   1513 
   1514   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
   1515     .addReg(NewSuperReg, 0, SubIdx);
   1516 
   1517   return SubReg;
   1518 }
   1519 
   1520 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
   1521   MachineBasicBlock::iterator MII,
   1522   MachineRegisterInfo &MRI,
   1523   MachineOperand &Op,
   1524   const TargetRegisterClass *SuperRC,
   1525   unsigned SubIdx,
   1526   const TargetRegisterClass *SubRC) const {
   1527   if (Op.isImm()) {
   1528     // XXX - Is there a better way to do this?
   1529     if (SubIdx == AMDGPU::sub0)
   1530       return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
   1531     if (SubIdx == AMDGPU::sub1)
   1532       return MachineOperand::CreateImm(Op.getImm() >> 32);
   1533 
   1534     llvm_unreachable("Unhandled register index for immediate");
   1535   }
   1536 
   1537   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
   1538                                        SubIdx, SubRC);
   1539   return MachineOperand::CreateReg(SubReg, false);
   1540 }
   1541 
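         // Materializes a 64-bit immediate as two S_MOV_B32s combined by a
         // REG_SEQUENCE and returns the 64-bit result register. Both moves are
         // pushed onto the worklist so they can be moved to the VALU later if
         // necessary.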
   1542 unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
   1543                                     MachineBasicBlock::iterator MI,
   1544                                     MachineRegisterInfo &MRI,
   1545                                     const TargetRegisterClass *RC,
   1546                                     const MachineOperand &Op) const {
   1547   MachineBasicBlock *MBB = MI->getParent();
   1548   DebugLoc DL = MI->getDebugLoc();
   1549   unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   1550   unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   1551   unsigned Dst = MRI.createVirtualRegister(RC);
   1552 
   1553   MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
   1554                              LoDst)
   1555     .addImm(Op.getImm() & 0xFFFFFFFF);
   1556   MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
   1557                              HiDst)
   1558     .addImm(Op.getImm() >> 32);
   1559 
   1560   BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
   1561     .addReg(LoDst)
   1562     .addImm(AMDGPU::sub0)
   1563     .addReg(HiDst)
   1564     .addImm(AMDGPU::sub1);
   1565 
   1566   Worklist.push_back(Lo);
   1567   Worklist.push_back(Hi);
   1568 
   1569   return Dst;
   1570 }
   1571 
   1572 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
   1573 void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
   1574   assert(Inst->getNumExplicitOperands() == 3);
   1575   MachineOperand Op1 = Inst->getOperand(1);
   1576   Inst->RemoveOperand(1);
   1577   Inst->addOperand(Op1);
   1578 }
   1579 
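         // An operand is legal here if it does not cause a VALU instruction to
         // read the constant bus more than once, and either its register class
         // is a subclass of what the operand definition requires or it is an
         // immediate accepted by isImmOperandLegal.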
   1580 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
   1581                                  const MachineOperand *MO) const {
   1582   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   1583   const MCInstrDesc &InstDesc = get(MI->getOpcode());
   1584   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
   1585   const TargetRegisterClass *DefinedRC =
   1586       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
   1587   if (!MO)
   1588     MO = &MI->getOperand(OpIdx);
   1589 
   1590   if (isVALU(InstDesc.Opcode) &&
   1591       usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
   1592     unsigned SGPRUsed =
   1593         MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
   1594     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
   1595       if (i == OpIdx)
   1596         continue;
   1597       const MachineOperand &Op = MI->getOperand(i);
   1598       if (Op.isReg() && Op.getReg() != SGPRUsed &&
   1599           usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
   1600         return false;
   1601       }
   1602     }
   1603   }
   1604 
   1605   if (MO->isReg()) {
   1606     assert(DefinedRC);
   1607     const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());
   1608 
   1609     // In order to be legal, the common sub-class must be equal to the
   1610     // class of the current operand.  For example:
   1611     //
   1612     // v_mov_b32 s0 ; Operand defined as vsrc_32
   1613     //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
   1614     //
   1615     // s_sendmsg 0, s0 ; Operand defined as m0reg
   1616     //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
   1617 
   1618     return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
   1619   }
   1620 
   1621 
   1622   // Handle non-register types that are treated like immediates.
   1623   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
   1624 
   1625   if (!DefinedRC) {
   1626     // This operand expects an immediate.
   1627     return true;
   1628   }
   1629 
   1630   return isImmOperandLegal(MI, OpIdx, *MO);
   1631 }
   1632 
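         // Rewrites illegal operands in place. Handles, in order: VOP2 sources
         // (commuting or inserting moves), VOP3 sources (at most one SGPR),
         // REG_SEQUENCE/PHI operand classes, INSERT_SUBREG's src0 class, and
         // MUBUF resource/address operands.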
   1633 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   1634   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   1635 
   1636   int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
   1637                                            AMDGPU::OpName::src0);
   1638   int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
   1639                                            AMDGPU::OpName::src1);
   1640   int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
   1641                                            AMDGPU::OpName::src2);
   1642 
   1643   // Legalize VOP2
   1644   if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
   1645     // Legalize src0
   1646     if (!isOperandLegal(MI, Src0Idx))
   1647       legalizeOpWithMove(MI, Src0Idx);
   1648 
   1649     // Legalize src1
   1650     if (isOperandLegal(MI, Src1Idx))
   1651       return;
   1652 
    1653     // Usually src0 of VOP2 instructions allows more types of inputs
   1654     // than src1, so try to commute the instruction to decrease our
   1655     // chances of having to insert a MOV instruction to legalize src1.
   1656     if (MI->isCommutable()) {
   1657       if (commuteInstruction(MI))
   1658         // If we are successful in commuting, then we know MI is legal, so
   1659         // we are done.
   1660         return;
   1661     }
   1662 
   1663     legalizeOpWithMove(MI, Src1Idx);
   1664     return;
   1665   }
   1666 
   1667   // XXX - Do any VOP3 instructions read VCC?
   1668   // Legalize VOP3
   1669   if (isVOP3(MI->getOpcode())) {
   1670     int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };
   1671 
   1672     // Find the one SGPR operand we are allowed to use.
   1673     unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
   1674 
   1675     for (unsigned i = 0; i < 3; ++i) {
   1676       int Idx = VOP3Idx[i];
   1677       if (Idx == -1)
   1678         break;
   1679       MachineOperand &MO = MI->getOperand(Idx);
   1680 
   1681       if (MO.isReg()) {
   1682         if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
   1683           continue; // VGPRs are legal
   1684 
   1685         assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
   1686 
   1687         if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
   1688           SGPRReg = MO.getReg();
   1689           // We can use one SGPR in each VOP3 instruction.
   1690           continue;
   1691         }
   1692       } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
   1693         // If it is not a register and not a literal constant, then it must be
   1694         // an inline constant which is always legal.
   1695         continue;
   1696       }
   1697       // If we make it this far, then the operand is not legal and we must
   1698       // legalize it.
   1699       legalizeOpWithMove(MI, Idx);
   1700     }
   1701   }
   1702 
   1703   // Legalize REG_SEQUENCE and PHI
    1704   // The register class of the operands must be the same as the register
   1705   // class of the output.
   1706   if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
   1707       MI->getOpcode() == AMDGPU::PHI) {
   1708     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
   1709     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
   1710       if (!MI->getOperand(i).isReg() ||
   1711           !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
   1712         continue;
   1713       const TargetRegisterClass *OpRC =
   1714               MRI.getRegClass(MI->getOperand(i).getReg());
   1715       if (RI.hasVGPRs(OpRC)) {
   1716         VRC = OpRC;
   1717       } else {
   1718         SRC = OpRC;
   1719       }
   1720     }
   1721 
    1722     // If any of the operands are VGPR registers, then they all must be
    1723     // VGPRs; otherwise we will create illegal VGPR->SGPR copies when
    1724     // legalizing them.
   1725     if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
   1726       if (!VRC) {
   1727         assert(SRC);
   1728         VRC = RI.getEquivalentVGPRClass(SRC);
   1729       }
   1730       RC = VRC;
   1731     } else {
   1732       RC = SRC;
   1733     }
   1734 
   1735     // Update all the operands so they have the same type.
   1736     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
   1737       if (!MI->getOperand(i).isReg() ||
   1738           !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
   1739         continue;
   1740       unsigned DstReg = MRI.createVirtualRegister(RC);
   1741       MachineBasicBlock *InsertBB;
   1742       MachineBasicBlock::iterator Insert;
   1743       if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
   1744         InsertBB = MI->getParent();
   1745         Insert = MI;
   1746       } else {
   1747         // MI is a PHI instruction.
   1748         InsertBB = MI->getOperand(i + 1).getMBB();
   1749         Insert = InsertBB->getFirstTerminator();
   1750       }
   1751       BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
   1752               get(AMDGPU::COPY), DstReg)
   1753               .addOperand(MI->getOperand(i));
   1754       MI->getOperand(i).setReg(DstReg);
   1755     }
   1756   }
   1757 
   1758   // Legalize INSERT_SUBREG
   1759   // src0 must have the same register class as dst
   1760   if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
   1761     unsigned Dst = MI->getOperand(0).getReg();
   1762     unsigned Src0 = MI->getOperand(1).getReg();
   1763     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
   1764     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
   1765     if (DstRC != Src0RC) {
   1766       MachineBasicBlock &MBB = *MI->getParent();
   1767       unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
   1768       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
   1769               .addReg(Src0);
   1770       MI->getOperand(1).setReg(NewSrc0);
   1771     }
   1772     return;
   1773   }
   1774 
   1775   // Legalize MUBUF* instructions
   1776   // FIXME: If we start using the non-addr64 instructions for compute, we
   1777   // may need to legalize them here.
   1778   int SRsrcIdx =
   1779       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
   1780   if (SRsrcIdx != -1) {
   1781     // We have an MUBUF instruction
   1782     MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
   1783     unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
   1784     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
   1785                                              RI.getRegClass(SRsrcRC))) {
   1786       // The operands are legal.
    1787       // FIXME: We may need to legalize operands besides srsrc.
   1788       return;
   1789     }
   1790 
   1791     MachineBasicBlock &MBB = *MI->getParent();
    1792     // Extract the ptr from the resource descriptor.
   1793 
   1794     // SRsrcPtrLo = srsrc:sub0
   1795     unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
   1796         &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
   1797 
   1798     // SRsrcPtrHi = srsrc:sub1
   1799     unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
   1800         &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
   1801 
   1802     // Create an empty resource descriptor
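             // The new descriptor gets a zero base pointer (the real pointer is
             // moved into VADDR below) plus the default buffer data format in the
             // two high dwords.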
   1803     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
   1804     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   1805     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   1806     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
   1807     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
   1808 
   1809     // Zero64 = 0
   1810     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
   1811             Zero64)
   1812             .addImm(0);
   1813 
   1814     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
   1815     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
   1816             SRsrcFormatLo)
   1817             .addImm(RsrcDataFormat & 0xFFFFFFFF);
   1818 
   1819     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
   1820     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
   1821             SRsrcFormatHi)
   1822             .addImm(RsrcDataFormat >> 32);
   1823 
   1824     // NewSRsrc = {Zero64, SRsrcFormat}
   1825     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
   1826             NewSRsrc)
   1827             .addReg(Zero64)
   1828             .addImm(AMDGPU::sub0_sub1)
   1829             .addReg(SRsrcFormatLo)
   1830             .addImm(AMDGPU::sub2)
   1831             .addReg(SRsrcFormatHi)
   1832             .addImm(AMDGPU::sub3);
   1833 
   1834     MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
   1835     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   1836     unsigned NewVAddrLo;
   1837     unsigned NewVAddrHi;
   1838     if (VAddr) {
   1839       // This is already an ADDR64 instruction so we need to add the pointer
   1840       // extracted from the resource descriptor to the current value of VAddr.
   1841       NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1842       NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1843 
   1844       // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
   1845       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
   1846               NewVAddrLo)
   1847               .addReg(SRsrcPtrLo)
   1848               .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
   1849               .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
   1850 
   1851       // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
   1852       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
   1853               NewVAddrHi)
   1854               .addReg(SRsrcPtrHi)
   1855               .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
   1856               .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
   1857               .addReg(AMDGPU::VCC, RegState::Implicit);
   1858 
   1859     } else {
    1860       // This instruction is the _OFFSET variant, so we need to convert it to
   1861       // ADDR64.
   1862       MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
   1863       MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
   1864       MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
   1865 
   1866       // Create the new instruction.
   1867       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
   1868       MachineInstr *Addr64 =
   1869           BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
   1870                   .addOperand(*VData)
   1871                   .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
   1872                                               // This will be replaced later
   1873                                               // with the new value of vaddr.
   1874                   .addOperand(*SRsrc)
   1875                   .addOperand(*SOffset)
   1876                   .addOperand(*Offset)
   1877                   .addImm(0) // glc
   1878                   .addImm(0) // slc
   1879                   .addImm(0); // tfe
   1880 
   1881       MI->removeFromParent();
   1882       MI = Addr64;
   1883 
   1884       NewVAddrLo = SRsrcPtrLo;
   1885       NewVAddrHi = SRsrcPtrHi;
   1886       VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
   1887       SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
   1888     }
   1889 
    1890     // NewVAddr = {NewVAddrLo, NewVAddrHi}
   1891     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
   1892             NewVAddr)
   1893             .addReg(NewVAddrLo)
   1894             .addImm(AMDGPU::sub0)
   1895             .addReg(NewVAddrHi)
   1896             .addImm(AMDGPU::sub1);
   1897 
   1898 
   1899     // Update the instruction to use NewVaddr
   1900     VAddr->setReg(NewVAddr);
   1901     // Update the instruction to use NewSRsrc
   1902     SRsrc->setReg(NewSRsrc);
   1903   }
   1904 }
   1905 
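         // Splits a wide SMRD load into two loads of half the width, falling back
         // to the _SGPR form for the high half when its immediate offset no longer
         // fits, and recombines the two results into the original destination with
         // a REG_SEQUENCE.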
   1906 void SIInstrInfo::splitSMRD(MachineInstr *MI,
   1907                             const TargetRegisterClass *HalfRC,
   1908                             unsigned HalfImmOp, unsigned HalfSGPROp,
   1909                             MachineInstr *&Lo, MachineInstr *&Hi) const {
   1910 
   1911   DebugLoc DL = MI->getDebugLoc();
   1912   MachineBasicBlock *MBB = MI->getParent();
   1913   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   1914   unsigned RegLo = MRI.createVirtualRegister(HalfRC);
   1915   unsigned RegHi = MRI.createVirtualRegister(HalfRC);
   1916   unsigned HalfSize = HalfRC->getSize();
   1917   const MachineOperand *OffOp =
   1918       getNamedOperand(*MI, AMDGPU::OpName::offset);
   1919   const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
   1920 
   1921   // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
   1922   // on VI.
   1923 
   1924   bool IsKill = SBase->isKill();
   1925   if (OffOp) {
   1926     bool isVI =
   1927         MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
   1928         AMDGPUSubtarget::VOLCANIC_ISLANDS;
   1929     unsigned OffScale = isVI ? 1 : 4;
   1930     // Handle the _IMM variant
   1931     unsigned LoOffset = OffOp->getImm() * OffScale;
   1932     unsigned HiOffset = LoOffset + HalfSize;
   1933     Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
   1934                   // Use addReg instead of addOperand
   1935                   // to make sure kill flag is cleared.
   1936                   .addReg(SBase->getReg(), 0, SBase->getSubReg())
   1937                   .addImm(LoOffset / OffScale);
   1938 
   1939     if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
   1940       unsigned OffsetSGPR =
   1941           MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   1942       BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
   1943               .addImm(HiOffset); // The offset in register is in bytes.
   1944       Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
   1945                     .addReg(SBase->getReg(), getKillRegState(IsKill),
   1946                             SBase->getSubReg())
   1947                     .addReg(OffsetSGPR);
   1948     } else {
   1949       Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
   1950                      .addReg(SBase->getReg(), getKillRegState(IsKill),
   1951                              SBase->getSubReg())
   1952                      .addImm(HiOffset / OffScale);
   1953     }
   1954   } else {
   1955     // Handle the _SGPR variant
   1956     MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
   1957     Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
   1958                   .addReg(SBase->getReg(), 0, SBase->getSubReg())
   1959                   .addOperand(*SOff);
   1960     unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
   1961     BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
   1962             .addOperand(*SOff)
   1963             .addImm(HalfSize);
   1964     Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp))
   1965                   .addReg(SBase->getReg(), getKillRegState(IsKill),
   1966                           SBase->getSubReg())
   1967                   .addReg(OffsetSGPR);
   1968   }
   1969 
   1970   unsigned SubLo, SubHi;
   1971   switch (HalfSize) {
   1972     case 4:
   1973       SubLo = AMDGPU::sub0;
   1974       SubHi = AMDGPU::sub1;
   1975       break;
   1976     case 8:
   1977       SubLo = AMDGPU::sub0_sub1;
   1978       SubHi = AMDGPU::sub2_sub3;
   1979       break;
   1980     case 16:
   1981       SubLo = AMDGPU::sub0_sub1_sub2_sub3;
   1982       SubHi = AMDGPU::sub4_sub5_sub6_sub7;
   1983       break;
   1984     case 32:
   1985       SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
   1986       SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
   1987       break;
   1988     default:
   1989       llvm_unreachable("Unhandled HalfSize");
   1990   }
   1991 
   1992   BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
   1993           .addOperand(MI->getOperand(0))
   1994           .addReg(RegLo)
   1995           .addImm(SubLo)
   1996           .addReg(RegHi)
   1997           .addImm(SubHi);
   1998 }
   1999 
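         // Rewrites an SMRD load as a MUBUF ADDR64 load by synthesizing a 128-bit
         // resource descriptor from the offset and the default data format.
         // DWORDX8 and DWORDX16 loads are first split in half and the halves are
         // converted recursively.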
   2000 void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
   2001   MachineBasicBlock *MBB = MI->getParent();
   2002   switch (MI->getOpcode()) {
   2003     case AMDGPU::S_LOAD_DWORD_IMM:
   2004     case AMDGPU::S_LOAD_DWORD_SGPR:
   2005     case AMDGPU::S_LOAD_DWORDX2_IMM:
   2006     case AMDGPU::S_LOAD_DWORDX2_SGPR:
   2007     case AMDGPU::S_LOAD_DWORDX4_IMM:
   2008     case AMDGPU::S_LOAD_DWORDX4_SGPR: {
   2009       unsigned NewOpcode = getVALUOp(*MI);
   2010       unsigned RegOffset;
   2011       unsigned ImmOffset;
   2012 
   2013       if (MI->getOperand(2).isReg()) {
   2014         RegOffset = MI->getOperand(2).getReg();
   2015         ImmOffset = 0;
   2016       } else {
   2017         assert(MI->getOperand(2).isImm());
    2018         // SMRD instructions take a dword offset on SI and a byte offset on VI,
    2019         // while MUBUF instructions always take a byte offset.
   2020         ImmOffset = MI->getOperand(2).getImm();
   2021         if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
   2022             AMDGPUSubtarget::SEA_ISLANDS)
   2023           ImmOffset <<= 2;
   2024         RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   2025 
   2026         if (isUInt<12>(ImmOffset)) {
   2027           BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
   2028                   RegOffset)
   2029                   .addImm(0);
   2030         } else {
   2031           BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
   2032                   RegOffset)
   2033                   .addImm(ImmOffset);
   2034           ImmOffset = 0;
   2035         }
   2036       }
   2037 
   2038       unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
   2039       unsigned DWord0 = RegOffset;
   2040       unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   2041       unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   2042       unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
   2043       uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
   2044 
   2045       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
   2046               .addImm(0);
   2047       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
   2048               .addImm(RsrcDataFormat & 0xFFFFFFFF);
   2049       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
   2050               .addImm(RsrcDataFormat >> 32);
   2051       BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
   2052               .addReg(DWord0)
   2053               .addImm(AMDGPU::sub0)
   2054               .addReg(DWord1)
   2055               .addImm(AMDGPU::sub1)
   2056               .addReg(DWord2)
   2057               .addImm(AMDGPU::sub2)
   2058               .addReg(DWord3)
   2059               .addImm(AMDGPU::sub3);
   2060       MI->setDesc(get(NewOpcode));
   2061       if (MI->getOperand(2).isReg()) {
   2062         MI->getOperand(2).setReg(SRsrc);
   2063       } else {
   2064         MI->getOperand(2).ChangeToRegister(SRsrc, false);
   2065       }
   2066       MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0));
   2067       MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
   2068       MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc
   2069       MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc
   2070       MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe
   2071 
   2072       const TargetRegisterClass *NewDstRC =
   2073           RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass);
   2074 
   2075       unsigned DstReg = MI->getOperand(0).getReg();
   2076       unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
   2077       MRI.replaceRegWith(DstReg, NewDstReg);
   2078       break;
   2079     }
   2080     case AMDGPU::S_LOAD_DWORDX8_IMM:
   2081     case AMDGPU::S_LOAD_DWORDX8_SGPR: {
   2082       MachineInstr *Lo, *Hi;
   2083       splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
   2084                 AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
   2085       MI->eraseFromParent();
   2086       moveSMRDToVALU(Lo, MRI);
   2087       moveSMRDToVALU(Hi, MRI);
   2088       break;
   2089     }
   2090 
   2091     case AMDGPU::S_LOAD_DWORDX16_IMM:
   2092     case AMDGPU::S_LOAD_DWORDX16_SGPR: {
   2093       MachineInstr *Lo, *Hi;
   2094       splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
   2095                 AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
   2096       MI->eraseFromParent();
   2097       moveSMRDToVALU(Lo, MRI);
   2098       moveSMRDToVALU(Hi, MRI);
   2099       break;
   2100     }
   2101   }
   2102 }
   2103 
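         // Worklist-driven conversion of scalar instructions to vector ones.
         // 64-bit scalar ops are split into 32-bit halves and SMRD loads become
         // buffer loads; everything else has its opcode swapped for the VALU
         // equivalent, SCC references removed, operands legalized, and any user
         // that cannot read the new VGPR result is queued for conversion too.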
   2104 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
   2105   SmallVector<MachineInstr *, 128> Worklist;
   2106   Worklist.push_back(&TopInst);
   2107 
   2108   while (!Worklist.empty()) {
   2109     MachineInstr *Inst = Worklist.pop_back_val();
   2110     MachineBasicBlock *MBB = Inst->getParent();
   2111     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   2112 
   2113     unsigned Opcode = Inst->getOpcode();
   2114     unsigned NewOpcode = getVALUOp(*Inst);
   2115 
   2116     // Handle some special cases
   2117     switch (Opcode) {
   2118     default:
   2119       if (isSMRD(Inst->getOpcode())) {
   2120         moveSMRDToVALU(Inst, MRI);
   2121       }
   2122       break;
   2123     case AMDGPU::S_MOV_B64: {
   2124       DebugLoc DL = Inst->getDebugLoc();
   2125 
   2126       // If the source operand is a register we can replace this with a
   2127       // copy.
   2128       if (Inst->getOperand(1).isReg()) {
   2129         MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
   2130           .addOperand(Inst->getOperand(0))
   2131           .addOperand(Inst->getOperand(1));
   2132         Worklist.push_back(Copy);
   2133       } else {
   2134         // Otherwise, we need to split this into two movs, because there is
   2135         // no 64-bit VALU move instruction.
   2136         unsigned Reg = Inst->getOperand(0).getReg();
   2137         unsigned Dst = split64BitImm(Worklist,
   2138                                      Inst,
   2139                                      MRI,
   2140                                      MRI.getRegClass(Reg),
   2141                                      Inst->getOperand(1));
   2142         MRI.replaceRegWith(Reg, Dst);
   2143       }
   2144       Inst->eraseFromParent();
   2145       continue;
   2146     }
   2147     case AMDGPU::S_AND_B64:
   2148       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
   2149       Inst->eraseFromParent();
   2150       continue;
   2151 
   2152     case AMDGPU::S_OR_B64:
   2153       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
   2154       Inst->eraseFromParent();
   2155       continue;
   2156 
   2157     case AMDGPU::S_XOR_B64:
   2158       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
   2159       Inst->eraseFromParent();
   2160       continue;
   2161 
   2162     case AMDGPU::S_NOT_B64:
   2163       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
   2164       Inst->eraseFromParent();
   2165       continue;
   2166 
   2167     case AMDGPU::S_BCNT1_I32_B64:
   2168       splitScalar64BitBCNT(Worklist, Inst);
   2169       Inst->eraseFromParent();
   2170       continue;
   2171 
   2172     case AMDGPU::S_BFE_I64: {
   2173       splitScalar64BitBFE(Worklist, Inst);
   2174       Inst->eraseFromParent();
   2175       continue;
   2176     }
   2177 
   2178     case AMDGPU::S_LSHL_B32:
   2179       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
   2180         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
   2181         swapOperands(Inst);
   2182       }
   2183       break;
   2184     case AMDGPU::S_ASHR_I32:
   2185       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
   2186         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
   2187         swapOperands(Inst);
   2188       }
   2189       break;
   2190     case AMDGPU::S_LSHR_B32:
   2191       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
   2192         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
   2193         swapOperands(Inst);
   2194       }
   2195       break;
   2196     case AMDGPU::S_LSHL_B64:
   2197       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
   2198         NewOpcode = AMDGPU::V_LSHLREV_B64;
   2199         swapOperands(Inst);
   2200       }
   2201       break;
   2202     case AMDGPU::S_ASHR_I64:
   2203       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
   2204         NewOpcode = AMDGPU::V_ASHRREV_I64;
   2205         swapOperands(Inst);
   2206       }
   2207       break;
   2208     case AMDGPU::S_LSHR_B64:
   2209       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
   2210         NewOpcode = AMDGPU::V_LSHRREV_B64;
   2211         swapOperands(Inst);
   2212       }
   2213       break;
   2214 
   2215     case AMDGPU::S_BFE_U64:
   2216     case AMDGPU::S_BFM_B64:
   2217       llvm_unreachable("Moving this op to VALU not implemented");
   2218     }
   2219 
   2220     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
   2221       // We cannot move this instruction to the VALU, so we should try to
   2222       // legalize its operands instead.
   2223       legalizeOperands(Inst);
   2224       continue;
   2225     }
   2226 
   2227     // Use the new VALU Opcode.
   2228     const MCInstrDesc &NewDesc = get(NewOpcode);
   2229     Inst->setDesc(NewDesc);
   2230 
    2231     // Remove any references to SCC. Vector instructions can't read from it, and
    2232     // we're just about to add the implicit use/defs of VCC, so we don't want
    2233     // both.
   2234     for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
   2235       MachineOperand &Op = Inst->getOperand(i);
   2236       if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
   2237         Inst->RemoveOperand(i);
   2238     }
   2239 
   2240     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
   2241       // We are converting these to a BFE, so we need to add the missing
   2242       // operands for the size and offset.
   2243       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
   2244       Inst->addOperand(MachineOperand::CreateImm(0));
   2245       Inst->addOperand(MachineOperand::CreateImm(Size));
   2246 
   2247     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
   2248       // The VALU version adds the second operand to the result, so insert an
   2249       // extra 0 operand.
   2250       Inst->addOperand(MachineOperand::CreateImm(0));
   2251     }
   2252 
   2253     addDescImplicitUseDef(NewDesc, Inst);
   2254 
   2255     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
   2256       const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
   2257       // If we need to move this to VGPRs, we need to unpack the second operand
   2258       // back into the 2 separate ones for bit offset and width.
   2259       assert(OffsetWidthOp.isImm() &&
   2260              "Scalar BFE is only implemented for constant width and offset");
   2261       uint32_t Imm = OffsetWidthOp.getImm();
   2262 
   2263       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
   2264       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
   2265       Inst->RemoveOperand(2); // Remove old immediate.
   2266       Inst->addOperand(MachineOperand::CreateImm(Offset));
   2267       Inst->addOperand(MachineOperand::CreateImm(BitWidth));
   2268     }
   2269 
   2270     // Update the destination register class.
   2271 
   2272     const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
   2273 
   2274     switch (Opcode) {
   2275       // For target instructions, getOpRegClass just returns the virtual
   2276       // register class associated with the operand, so we need to find an
   2277       // equivalent VGPR register class in order to move the instruction to the
   2278       // VALU.
   2279     case AMDGPU::COPY:
   2280     case AMDGPU::PHI:
   2281     case AMDGPU::REG_SEQUENCE:
   2282     case AMDGPU::INSERT_SUBREG:
   2283       if (RI.hasVGPRs(NewDstRC))
   2284         continue;
   2285       NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
   2286       if (!NewDstRC)
   2287         continue;
   2288       break;
   2289     default:
   2290       break;
   2291     }
   2292 
   2293     unsigned DstReg = Inst->getOperand(0).getReg();
   2294     unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
   2295     MRI.replaceRegWith(DstReg, NewDstReg);
   2296 
   2297     // Legalize the operands
   2298     legalizeOperands(Inst);
   2299 
   2300     for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
   2301            E = MRI.use_end(); I != E; ++I) {
   2302       MachineInstr &UseMI = *I->getParent();
   2303       if (!canReadVGPR(UseMI, I.getOperandNo())) {
   2304         Worklist.push_back(&UseMI);
   2305       }
   2306     }
   2307   }
   2308 }
   2309 
   2310 //===----------------------------------------------------------------------===//
   2311 // Indirect addressing callbacks
   2312 //===----------------------------------------------------------------------===//
   2313 
   2314 unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
   2315                                                  unsigned Channel) const {
   2316   assert(Channel == 0);
   2317   return RegIndex;
   2318 }
   2319 
   2320 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
   2321   return &AMDGPU::VGPR_32RegClass;
   2322 }
   2323 
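         // Performs a 64-bit scalar unary operation as two 32-bit operations on
         // the sub0 and sub1 halves and recombines the halves with a REG_SEQUENCE.
         // splitScalar64BitBinaryOp below does the same for two-source operations.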
   2324 void SIInstrInfo::splitScalar64BitUnaryOp(
   2325   SmallVectorImpl<MachineInstr *> &Worklist,
   2326   MachineInstr *Inst,
   2327   unsigned Opcode) const {
   2328   MachineBasicBlock &MBB = *Inst->getParent();
   2329   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   2330 
   2331   MachineOperand &Dest = Inst->getOperand(0);
   2332   MachineOperand &Src0 = Inst->getOperand(1);
   2333   DebugLoc DL = Inst->getDebugLoc();
   2334 
   2335   MachineBasicBlock::iterator MII = Inst;
   2336 
   2337   const MCInstrDesc &InstDesc = get(Opcode);
   2338   const TargetRegisterClass *Src0RC = Src0.isReg() ?
   2339     MRI.getRegClass(Src0.getReg()) :
   2340     &AMDGPU::SGPR_32RegClass;
   2341 
   2342   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
   2343 
   2344   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   2345                                                        AMDGPU::sub0, Src0SubRC);
   2346 
   2347   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   2348   const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
   2349 
   2350   unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
   2351   MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
   2352     .addOperand(SrcReg0Sub0);
   2353 
   2354   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   2355                                                        AMDGPU::sub1, Src0SubRC);
   2356 
   2357   unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
   2358   MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
   2359     .addOperand(SrcReg0Sub1);
   2360 
   2361   unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
   2362   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
   2363     .addReg(DestSub0)
   2364     .addImm(AMDGPU::sub0)
   2365     .addReg(DestSub1)
   2366     .addImm(AMDGPU::sub1);
   2367 
   2368   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
   2369 
    2370   // Push both halves onto the worklist; they will be legalized there, which
    2371   // may swap the operand order to keep them valid.
   2372   Worklist.push_back(LoHalf);
   2373   Worklist.push_back(HiHalf);
   2374 }
   2375 
   2376 void SIInstrInfo::splitScalar64BitBinaryOp(
   2377   SmallVectorImpl<MachineInstr *> &Worklist,
   2378   MachineInstr *Inst,
   2379   unsigned Opcode) const {
   2380   MachineBasicBlock &MBB = *Inst->getParent();
   2381   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   2382 
   2383   MachineOperand &Dest = Inst->getOperand(0);
   2384   MachineOperand &Src0 = Inst->getOperand(1);
   2385   MachineOperand &Src1 = Inst->getOperand(2);
   2386   DebugLoc DL = Inst->getDebugLoc();
   2387 
   2388   MachineBasicBlock::iterator MII = Inst;
   2389 
   2390   const MCInstrDesc &InstDesc = get(Opcode);
   2391   const TargetRegisterClass *Src0RC = Src0.isReg() ?
   2392     MRI.getRegClass(Src0.getReg()) :
   2393     &AMDGPU::SGPR_32RegClass;
   2394 
   2395   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
   2396   const TargetRegisterClass *Src1RC = Src1.isReg() ?
   2397     MRI.getRegClass(Src1.getReg()) :
   2398     &AMDGPU::SGPR_32RegClass;
   2399 
   2400   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
   2401 
   2402   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   2403                                                        AMDGPU::sub0, Src0SubRC);
   2404   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
   2405                                                        AMDGPU::sub0, Src1SubRC);
   2406 
   2407   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   2408   const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
   2409 
   2410   unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
   2411   MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
   2412     .addOperand(SrcReg0Sub0)
   2413     .addOperand(SrcReg1Sub0);
   2414 
   2415   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
   2416                                                        AMDGPU::sub1, Src0SubRC);
   2417   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
   2418                                                        AMDGPU::sub1, Src1SubRC);
   2419 
   2420   unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
   2421   MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
   2422     .addOperand(SrcReg0Sub1)
   2423     .addOperand(SrcReg1Sub1);
   2424 
   2425   unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
   2426   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
   2427     .addReg(DestSub0)
   2428     .addImm(AMDGPU::sub0)
   2429     .addReg(DestSub1)
   2430     .addImm(AMDGPU::sub1);
   2431 
   2432   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
   2433 
    2434   // Push both halves onto the worklist; they will be legalized there, which
    2435   // may swap the operand order to keep them valid.
   2436   Worklist.push_back(LoHalf);
   2437   Worklist.push_back(HiHalf);
   2438 }
   2439 
   2440 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
   2441                                        MachineInstr *Inst) const {
   2442   MachineBasicBlock &MBB = *Inst->getParent();
   2443   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   2444 
   2445   MachineBasicBlock::iterator MII = Inst;
   2446   DebugLoc DL = Inst->getDebugLoc();
   2447 
   2448   MachineOperand &Dest = Inst->getOperand(0);
   2449   MachineOperand &Src = Inst->getOperand(1);
   2450 
   2451   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
   2452   const TargetRegisterClass *SrcRC = Src.isReg() ?
   2453     MRI.getRegClass(Src.getReg()) :
   2454     &AMDGPU::SGPR_32RegClass;
   2455 
   2456   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   2457   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   2458 
   2459   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
   2460 
   2461   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
   2462                                                       AMDGPU::sub0, SrcSubRC);
   2463   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
   2464                                                       AMDGPU::sub1, SrcSubRC);
   2465 
   2466   MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
   2467     .addOperand(SrcRegSub0)
   2468     .addImm(0);
   2469 
   2470   MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
   2471     .addOperand(SrcRegSub1)
   2472     .addReg(MidReg);
   2473 
   2474   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   2475 
   2476   Worklist.push_back(First);
   2477   Worklist.push_back(Second);
   2478 }
   2479 
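         // Expands a 64-bit sign-extend-in-register (S_BFE_I64 with offset 0).
         // For widths below 32 the low half is a V_BFE_I32 and the high half is
         // its sign bit broadcast by an arithmetic shift; for width 32 the low
         // half is reused directly and only the high half is computed.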
   2480 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
   2481                                       MachineInstr *Inst) const {
   2482   MachineBasicBlock &MBB = *Inst->getParent();
   2483   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   2484   MachineBasicBlock::iterator MII = Inst;
   2485   DebugLoc DL = Inst->getDebugLoc();
   2486 
   2487   MachineOperand &Dest = Inst->getOperand(0);
   2488   uint32_t Imm = Inst->getOperand(2).getImm();
   2489   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
   2490   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
   2491 
   2492   (void) Offset;
   2493 
    2494   // Only the sext_inreg case is handled.
   2495   assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
   2496          BitWidth <= 32 &&
   2497          Offset == 0 &&
   2498          "Not implemented");
   2499 
   2500   if (BitWidth < 32) {
   2501     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   2502     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   2503     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   2504 
   2505     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
   2506       .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
   2507       .addImm(0)
   2508       .addImm(BitWidth);
   2509 
   2510     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
   2511       .addImm(31)
   2512       .addReg(MidRegLo);
   2513 
   2514     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
   2515       .addReg(MidRegLo)
   2516       .addImm(AMDGPU::sub0)
   2517       .addReg(MidRegHi)
   2518       .addImm(AMDGPU::sub1);
   2519 
   2520     MRI.replaceRegWith(Dest.getReg(), ResultReg);
   2521     return;
   2522   }
   2523 
   2524   MachineOperand &Src = Inst->getOperand(1);
   2525   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   2526   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
   2527 
   2528   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
   2529     .addImm(31)
   2530     .addReg(Src.getReg(), 0, AMDGPU::sub0);
   2531 
   2532   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
   2533     .addReg(Src.getReg(), 0, AMDGPU::sub0)
   2534     .addImm(AMDGPU::sub0)
   2535     .addReg(TmpReg)
   2536     .addImm(AMDGPU::sub1);
   2537 
   2538   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   2539 }
   2540 
   2541 void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
   2542                                         MachineInstr *Inst) const {
    2543   // Add the implicit register uses and defs from the instruction descriptor.
   2544   if (NewDesc.ImplicitUses) {
   2545     for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
   2546       unsigned Reg = NewDesc.ImplicitUses[i];
   2547       Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
   2548     }
   2549   }
   2550 
   2551   if (NewDesc.ImplicitDefs) {
   2552     for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
   2553       unsigned Reg = NewDesc.ImplicitDefs[i];
   2554       Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
   2555     }
   2556   }
   2557 }
   2558 
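         // Returns the single SGPR a VOP3 instruction is already committed to
         // reading, if any: an implicit VCC or FLAT_SCR use, an operand whose
         // definition requires an SGPR class, or failing that an SGPR that is
         // reused among the sources. NoRegister means the caller may keep any one
         // SGPR operand.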
   2559 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
   2560                                    int OpIndices[3]) const {
   2561   const MCInstrDesc &Desc = get(MI->getOpcode());
   2562 
   2563   // Find the one SGPR operand we are allowed to use.
   2564   unsigned SGPRReg = AMDGPU::NoRegister;
   2565 
   2566   // First we need to consider the instruction's operand requirements before
   2567   // legalizing. Some operands are required to be SGPRs, such as implicit uses
   2568   // of VCC, but we are still bound by the constant bus requirement to only use
   2569   // one.
   2570   //
   2571   // If the operand's class is an SGPR, we can never move it.
   2572 
   2573   for (const MachineOperand &MO : MI->implicit_operands()) {
   2574     // We only care about reads.
   2575     if (MO.isDef())
   2576       continue;
   2577 
   2578     if (MO.getReg() == AMDGPU::VCC)
   2579       return AMDGPU::VCC;
   2580 
   2581     if (MO.getReg() == AMDGPU::FLAT_SCR)
   2582       return AMDGPU::FLAT_SCR;
   2583   }
   2584 
   2585   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
   2586   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   2587 
   2588   for (unsigned i = 0; i < 3; ++i) {
   2589     int Idx = OpIndices[i];
   2590     if (Idx == -1)
   2591       break;
   2592 
   2593     const MachineOperand &MO = MI->getOperand(Idx);
   2594     if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass))
   2595       SGPRReg = MO.getReg();
   2596 
   2597     if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
   2598       UsedSGPRs[i] = MO.getReg();
   2599   }
   2600 
   2601   if (SGPRReg != AMDGPU::NoRegister)
   2602     return SGPRReg;
   2603 
   2604   // We don't have a required SGPR operand, so we have a bit more freedom in
   2605   // selecting operands to move.
   2606 
   2607   // Try to keep the most frequently used SGPR: if the same SGPR appears in
   2608   // more than one operand, choosing it minimizes the operands to be moved.
   2609   //
   2610   // e.g.
   2611   // V_FMA_F32 v0, s0, s0, s0 -> No moves
   2612   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
   2613 
   2614   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
   2615     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
   2616       SGPRReg = UsedSGPRs[0];
   2617   }
   2618 
   2619   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
   2620     if (UsedSGPRs[1] == UsedSGPRs[2])
   2621       SGPRReg = UsedSGPRs[1];
   2622   }
   2623 
   2624   return SGPRReg;
   2625 }
   2626 
   2627 MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
   2628                                    MachineBasicBlock *MBB,
   2629                                    MachineBasicBlock::iterator I,
   2630                                    unsigned ValueReg,
   2631                                    unsigned Address, unsigned OffsetReg) const {
   2632   const DebugLoc &DL = MBB->findDebugLoc(I);
   2633   unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
   2634                                       getIndirectIndexBegin(*MBB->getParent()));
   2635 
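          // SI_INDIRECT_DST_V1 is a pseudo; it is expanded later in the backend
          // (SILowerControlFlow) into a V_MOVRELD-based sequence that stores
          // ValueReg into the element selected by OffsetReg, relative to the
          // reserved indirect base register.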
   2636   return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
   2637           .addReg(IndirectBaseReg, RegState::Define)
   2638           .addOperand(I->getOperand(0))
   2639           .addReg(IndirectBaseReg)
   2640           .addReg(OffsetReg)
   2641           .addImm(0)
   2642           .addReg(ValueReg);
   2643 }
   2644 
   2645 MachineInstrBuilder SIInstrInfo::buildIndirectRead(
   2646                                    MachineBasicBlock *MBB,
   2647                                    MachineBasicBlock::iterator I,
   2648                                    unsigned ValueReg,
   2649                                    unsigned Address, unsigned OffsetReg) const {
   2650   const DebugLoc &DL = MBB->findDebugLoc(I);
   2651   unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
   2652                                       getIndirectIndexBegin(*MBB->getParent()));
   2653 
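          // SI_INDIRECT_SRC is likewise a pseudo that is expanded later into a
          // V_MOVRELS-based sequence reading the element selected by OffsetReg.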
   2654   return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
   2655           .addOperand(I->getOperand(0))
   2656           .addOperand(I->getOperand(1))
   2657           .addReg(IndirectBaseReg)
   2658           .addReg(OffsetReg)
   2659           .addImm(0);
   2661 }
   2662 
   2663 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
   2664                                             const MachineFunction &MF) const {
   2665   int End = getIndirectIndexEnd(MF);
   2666   int Begin = getIndirectIndexBegin(MF);
   2667 
   2668   if (End == -1)
   2669     return;
   2670 
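          // An N-dword register tuple that starts up to N-1 registers before the
          // indirectly addressed range still overlaps it, so each wider class backs
          // the start index up accordingly (1 for 64-bit, 2 for 96-bit, 3 for
          // 128-bit, 7 for 256-bit and 15 for 512-bit tuples).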
   2672   for (int Index = Begin; Index <= End; ++Index)
   2673     Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));
   2674 
   2675   for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
   2676     Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
   2677 
   2678   for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
   2679     Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
   2680 
   2681   for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
   2682     Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
   2683 
   2684   for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
   2685     Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
   2686 
   2687   for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
   2688     Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
   2689 }
   2690 
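        // Returns the operand of \p MI with the given name, or nullptr when the
        // opcode has no such operand, which lets callers probe optional operands,
        // e.g. (illustrative):
        //   if (MachineOperand *Off = getNamedOperand(MI, AMDGPU::OpName::offset))
        //     Off->setImm(0);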
   2691 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
   2692                                              unsigned OperandName) const {
   2693   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
   2694   if (Idx == -1)
   2695     return nullptr;
   2696 
   2697   return &MI.getOperand(Idx);
   2698 }
   2699 
   2700 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
   2701   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
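          // Under the amdhsa OS, set bit 56 of the buffer resource descriptor, the
          // ATC bit on these targets, so buffer accesses go through the GPU's
          // address translation cache and use the host page tables.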
   2702   if (ST.isAmdHsaOS())
   2703     RsrcDataFormat |= (1ULL << 56);
   2704 
   2705   return RsrcDataFormat;
   2706 }
   2707