//===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
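  // The value to fold, discriminated by Kind: a register operand, a raw
  // immediate, or a frame index.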
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false) :
    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isCommuted() const {
    return Commuted;
  }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   unsigned UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
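// e.g. an immediate folded into src2 of V_MAC_F32_e64 must be an inline
// constant for the V_MAD_F32 the instruction becomes once the fold happens.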
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F32_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      unsigned MadOpc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      const MCInstrDesc &MadDesc = TII->get(MadOpc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
    return false;
  }
  default:
    return false;
  }
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

static bool updateOperand(FoldCandidate &Fold,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
        return false;
      // If upper part is all zero we do not need op_sel_hi.
      if (!isUInt<16>(Fold.ImmToFold)) {
        if (!(Fold.ImmToFold & 0xffff)) {
          Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
          return true;
        }
        Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
      }
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);

    Old.setIsUndef(New->isUndef());
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMAC_F32_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      unsigned NewOpc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        OpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        OpNo = CommuteIdx0;
    }

    // One of the operands might be an Imm operand, and OpNo may refer to it
    // after the call to commuteInstruction() below. Such situations are
    // avoided here explicitly as OpNo must be a register operand to be a
    // candidate for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold a full
    // copy, since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %1 = COPY %0:sub1
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
    //
    //  into
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
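  // i.e. when the use is an input of a REG_SEQUENCE, recurse into the users of
  // the REG_SEQUENCE result that read the matching subregister.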
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }


  bool FoldingImm = OpToFold.isImm();

  // In order to fold immediates into copies, we need to change the
  // copy to a MOV.
  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes.  Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities.  The shrink operands pass
    // already does this.
    return;
  }


  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);


  // Split 64-bit constants into 32-bits for folding.
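  // A use of sub0 receives the low 32 bits of the immediate and a use of sub1
  // receives the high 32 bits.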
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }



  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one.
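// Currently this handles v_cndmask whose two value sources are identical,
// which degenerates into a plain copy/mov of that source.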
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
      Opc == AMDGPU::V_CNDMASK_B32_e64    ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    if (Src1->isIdenticalTo(*Src0)) {
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
                                               : getMovOpc(false)));
      LLVM_DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.

      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // again. The same constant folded instruction could also have a second
        // use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        FoldList.clear();
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use) {
      MachineInstr *UseMI = Use->getParent();

      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (updateOperand(Fold, *TRI)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restore the instruction's original operand order if the fold failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
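// i.e. a clamp of x reaches here as a v_max with both sources equal to x and
// the clamp bit set; we try to fold that clamp bit into the def of x instead.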
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// We obviously have multiple uses in a clamp since the register is used twice
// in the same instruction.
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
  int Count = 0;
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
       I != E; ++I) {
    if (++Count > 1)
      return false;
  }

  return true;
}

// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
                    << '\n');

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
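// Returns the source operand that would receive the output modifier and the
// omod value (div2/mul2/mul4) implied by this multiply (or x+x add).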
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}

// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // TODO: Check nsz on instructions when fast math flags are preserved to MI
  // level.
  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();

  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB->begin(); I != MBB->end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        if (IsIEEEMode || !tryFoldOMod(MI))
          tryFoldClamp(MI);
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %3 = COPY %vgpr0; VGPR_32:%3
      //    ...
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return false;
}