Home | History | Annotate | Download | only in AMDGPU
      1 //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 /// \file This pass tries to apply several peephole SDWA patterns.
     11 ///
     12 /// E.g. original:
     13 ///   V_LSHRREV_B32_e32 %0, 16, %1
     14 ///   V_ADD_I32_e32 %2, %0, %3
     15 ///   V_LSHLREV_B32_e32 %4, 16, %2
     16 ///
     17 /// Replace:
     18 ///   V_ADD_I32_sdwa %4, %1, %3
     19 ///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
     20 ///
     21 //===----------------------------------------------------------------------===//
     22 
     23 #include "AMDGPU.h"
     24 #include "AMDGPUSubtarget.h"
     25 #include "SIDefines.h"
     26 #include "SIInstrInfo.h"
     27 #include "SIRegisterInfo.h"
     28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
     29 #include "Utils/AMDGPUBaseInfo.h"
     30 #include "llvm/ADT/None.h"
     31 #include "llvm/ADT/Optional.h"
     32 #include "llvm/ADT/STLExtras.h"
     33 #include "llvm/ADT/SmallVector.h"
     34 #include "llvm/ADT/Statistic.h"
     35 #include "llvm/CodeGen/MachineBasicBlock.h"
     36 #include "llvm/CodeGen/MachineFunction.h"
     37 #include "llvm/CodeGen/MachineFunctionPass.h"
     38 #include "llvm/CodeGen/MachineInstr.h"
     39 #include "llvm/CodeGen/MachineInstrBuilder.h"
     40 #include "llvm/CodeGen/MachineOperand.h"
     41 #include "llvm/CodeGen/MachineRegisterInfo.h"
     42 #include "llvm/CodeGen/TargetRegisterInfo.h"
     43 #include "llvm/Config/llvm-config.h"
     44 #include "llvm/MC/LaneBitmask.h"
     45 #include "llvm/MC/MCInstrDesc.h"
     46 #include "llvm/Pass.h"
     47 #include "llvm/Support/Debug.h"
     48 #include "llvm/Support/raw_ostream.h"
     49 #include <algorithm>
     50 #include <cassert>
     51 #include <cstdint>
     52 #include <memory>
     53 #include <unordered_map>
     54 
     55 using namespace llvm;
     56 
     57 #define DEBUG_TYPE "si-peephole-sdwa"
     58 
     59 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
     60 STATISTIC(NumSDWAInstructionsPeepholed,
     61           "Number of instruction converted to SDWA.");
     62 
     63 namespace {
     64 
     65 class SDWAOperand;
     66 class SDWADstOperand;
     67 
/// Machine-function pass that matches shift/mask/BFE patterns and folds them
/// into SDWA (sub-dword addressing) forms of VALU instructions.
class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  // Maps each pattern-defining instruction to the SDWA operand matched from it.
  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  // Maps each candidate instruction to the operands that could be applied to it.
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  // Instructions already rewritten to SDWA form during the current run.
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  // Returns the immediate behind Op, looking through a foldable copy if needed.
  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  // Scans MBB and records all SDWA operand patterns found in it.
  void matchSDWAOperands(MachineBasicBlock &MBB);
  // Tries to match a single instruction against the known SDWA patterns.
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  // Rewrites SGPR operands that the SDWA encoding cannot accept.
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
    104 
/// Abstract base for a matched SDWA pattern: a pair of operands where
/// \p Target should replace \p Replaced in the converted instruction.
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  // Returns the instruction this operand could be applied to, or nullptr.
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  // Applies this operand to the (already SDWA-form) instruction MI.
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  // Walks operand -> instruction -> basic block -> function to reach the MRI.
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
    135 
    136 using namespace AMDGPU::SDWA;
    137 
/// A matched SDWA *source* pattern: selects a byte/word of the replaced
/// register plus optional abs/neg (float) or sext (integer) modifiers.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  // Computes the src_modifiers immediate for SrcOp, merged with any
  // modifiers already on the instruction.
  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
    167 
/// A matched SDWA *destination* pattern: writes only the selected byte/word
/// of the destination, with the given policy for the unused bits.
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
    189 
/// Destination pattern with dst_unused:UNUSED_PRESERVE: the bits not written
/// by the SDWA instruction are taken from \p Preserve (matched from v_or_b32).
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
    208 
    209 } // end anonymous namespace
    210 
    211 INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
    212 
    213 char SIPeepholeSDWA::ID = 0;
    214 
    215 char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
    216 
    217 FunctionPass *llvm::createSIPeepholeSDWAPass() {
    218   return new SIPeepholeSDWA();
    219 }
    220 
    221 
    222 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    223 static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
    224   switch(Sel) {
    225   case BYTE_0: OS << "BYTE_0"; break;
    226   case BYTE_1: OS << "BYTE_1"; break;
    227   case BYTE_2: OS << "BYTE_2"; break;
    228   case BYTE_3: OS << "BYTE_3"; break;
    229   case WORD_0: OS << "WORD_0"; break;
    230   case WORD_1: OS << "WORD_1"; break;
    231   case DWORD:  OS << "DWORD"; break;
    232   }
    233   return OS;
    234 }
    235 
    236 static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
    237   switch(Un) {
    238   case UNUSED_PAD: OS << "UNUSED_PAD"; break;
    239   case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
    240   case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
    241   }
    242   return OS;
    243 }
    244 
    245 static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
    246   Operand.print(OS);
    247   return OS;
    248 }
    249 
    250 LLVM_DUMP_METHOD
    251 void SDWASrcOperand::print(raw_ostream& OS) const {
    252   OS << "SDWA src: " << *getTargetOperand()
    253     << " src_sel:" << getSrcSel()
    254     << " abs:" << getAbs() << " neg:" << getNeg()
    255     << " sext:" << getSext() << '\n';
    256 }
    257 
    258 LLVM_DUMP_METHOD
    259 void SDWADstOperand::print(raw_ostream& OS) const {
    260   OS << "SDWA dst: " << *getTargetOperand()
    261     << " dst_sel:" << getDstSel()
    262     << " dst_unused:" << getDstUnused() << '\n';
    263 }
    264 
    265 LLVM_DUMP_METHOD
    266 void SDWADstPreserveOperand::print(raw_ostream& OS) const {
    267   OS << "SDWA preserve dst: " << *getTargetOperand()
    268     << " dst_sel:" << getDstSel()
    269     << " preserve:" << *getPreservedOperand() << '\n';
    270 }
    271 
    272 #endif
    273 
    274 static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
    275   assert(To.isReg() && From.isReg());
    276   To.setReg(From.getReg());
    277   To.setSubReg(From.getSubReg());
    278   To.setIsUndef(From.isUndef());
    279   if (To.isUse()) {
    280     To.setIsKill(From.isKill());
    281   } else {
    282     To.setIsDead(From.isDead());
    283   }
    284 }
    285 
    286 static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
    287   return LHS.isReg() &&
    288          RHS.isReg() &&
    289          LHS.getReg() == RHS.getReg() &&
    290          LHS.getSubReg() == RHS.getSubReg();
    291 }
    292 
    293 static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
    294                                         const MachineRegisterInfo *MRI) {
    295   if (!Reg->isReg() || !Reg->isDef())
    296     return nullptr;
    297 
    298   MachineOperand *ResMO = nullptr;
    299   for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    300     // If there exist use of subreg of Reg then return nullptr
    301     if (!isSameReg(UseMO, *Reg))
    302       return nullptr;
    303 
    304     // Check that there is only one instruction that uses Reg
    305     if (!ResMO) {
    306       ResMO = &UseMO;
    307     } else if (ResMO->getParent() != UseMO.getParent()) {
    308       return nullptr;
    309     }
    310   }
    311 
    312   return ResMO;
    313 }
    314 
    315 static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
    316                                         const MachineRegisterInfo *MRI) {
    317   if (!Reg->isReg())
    318     return nullptr;
    319 
    320   MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
    321   if (!DefInstr)
    322     return nullptr;
    323 
    324   for (auto &DefMO : DefInstr->defs()) {
    325     if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
    326       return &DefMO;
    327   }
    328 
    329   // Ignore implicit defs.
    330   return nullptr;
    331 }
    332 
    333 uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
    334                                     const MachineOperand *SrcOp) const {
    335   uint64_t Mods = 0;
    336   const auto *MI = SrcOp->getParent();
    337   if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    338     if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
    339       Mods = Mod->getImm();
    340     }
    341   } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    342     if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
    343       Mods = Mod->getImm();
    344     }
    345   }
    346   if (Abs || Neg) {
    347     assert(!Sext &&
    348            "Float and integer src modifiers can't be set simulteniously");
    349     Mods |= Abs ? SISrcMods::ABS : 0;
    350     Mods ^= Neg ? SISrcMods::NEG : 0;
    351   } else if (Sext) {
    352     Mods |= SISrcMods::SEXT;
    353   }
    354 
    355   return Mods;
    356 }
    357 
    358 MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
    359   // For SDWA src operand potential instruction is one that use register
    360   // defined by parent instruction
    361   MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
    362   if (!PotentialMO)
    363     return nullptr;
    364 
    365   return PotentialMO->getParent();
    366 }
    367 
    368 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
    369   // Find operand in instruction that matches source operand and replace it with
    370   // target operand. Set corresponding src_sel
    371   bool IsPreserveSrc = false;
    372   MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    373   MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
    374   MachineOperand *SrcMods =
    375       TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
    376   assert(Src && (Src->isReg() || Src->isImm()));
    377   if (!isSameReg(*Src, *getReplacedOperand())) {
    378     // If this is not src0 then it could be src1
    379     Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    380     SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    381     SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
    382 
    383     if (!Src ||
    384         !isSameReg(*Src, *getReplacedOperand())) {
    385       // It's possible this Src is a tied operand for
    386       // UNUSED_PRESERVE, in which case we can either
    387       // abandon the peephole attempt, or if legal we can
    388       // copy the target operand into the tied slot
    389       // if the preserve operation will effectively cause the same
    390       // result by overwriting the rest of the dst.
    391       MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    392       MachineOperand *DstUnused =
    393         TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    394 
    395       if (Dst &&
    396           DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    397         // This will work if the tied src is acessing WORD_0, and the dst is
    398         // writing WORD_1. Modifiers don't matter because all the bits that
    399         // would be impacted are being overwritten by the dst.
    400         // Any other case will not work.
    401         SdwaSel DstSel = static_cast<SdwaSel>(
    402             TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
    403         if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
    404             getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
    405           IsPreserveSrc = true;
    406           auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
    407                                                    AMDGPU::OpName::vdst);
    408           auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
    409           Src = &MI.getOperand(TiedIdx);
    410           SrcSel = nullptr;
    411           SrcMods = nullptr;
    412         } else {
    413           // Not legal to convert this src
    414           return false;
    415         }
    416       }
    417     }
    418     assert(Src && Src->isReg());
    419 
    420     if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
    421          MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
    422          !isSameReg(*Src, *getReplacedOperand())) {
    423       // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
    424       // src2. This is not allowed.
    425       return false;
    426     }
    427 
    428     assert(isSameReg(*Src, *getReplacedOperand()) &&
    429            (IsPreserveSrc || (SrcSel && SrcMods)));
    430   }
    431   copyRegOperand(*Src, *getTargetOperand());
    432   if (!IsPreserveSrc) {
    433     SrcSel->setImm(getSrcSel());
    434     SrcMods->setImm(getSrcMods(TII, Src));
    435   }
    436   getTargetOperand()->setIsKill(false);
    437   return true;
    438 }
    439 
    440 MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
    441   // For SDWA dst operand potential instruction is one that defines register
    442   // that this operand uses
    443   MachineRegisterInfo *MRI = getMRI();
    444   MachineInstr *ParentMI = getParentInst();
    445 
    446   MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
    447   if (!PotentialMO)
    448     return nullptr;
    449 
    450   // Check that ParentMI is the only instruction that uses replaced register
    451   for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    452     if (&UseInst != ParentMI)
    453       return nullptr;
    454   }
    455 
    456   return PotentialMO->getParent();
    457 }
    458 
// Rewrites MI's vdst to this pattern's target register and sets
// dst_sel/dst_unused, then erases the now-redundant parent instruction.
// Returns false when the rewrite is not legal for MI.
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction  because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}
    486 
// Converts MI into an UNUSED_PRESERVE destination: moves MI to the position
// of the v_or_b32 that combined the two partial results, ties the preserved
// register to vdst as an implicit use, then finishes via the base-class
// conversion (which also erases the v_or_b32).
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problem with use of killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32 (getParentInst() is the v_or_b32 here).
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add Implicit use of preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use (the operand just appended, hence the last index)
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}
    516 
    517 Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
    518   if (Op.isImm()) {
    519     return Op.getImm();
    520   }
    521 
    522   // If this is not immediate then it can be copy of immediate value, e.g.:
    523   // %1 = S_MOV_B32 255;
    524   if (Op.isReg()) {
    525     for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
    526       if (!isSameReg(Op, Def))
    527         continue;
    528 
    529       const MachineInstr *DefInst = Def.getParent();
    530       if (!TII->isFoldableCopy(*DefInst))
    531         return None;
    532 
    533       const MachineOperand &Copied = DefInst->getOperand(1);
    534       if (!Copied.isImm())
    535         return None;
    536 
    537       return Copied.getImm();
    538     }
    539   }
    540 
    541   return None;
    542 }
    543 
    544 std::unique_ptr<SDWAOperand>
    545 SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
    546   unsigned Opcode = MI.getOpcode();
    547   switch (Opcode) {
    548   case AMDGPU::V_LSHRREV_B32_e32:
    549   case AMDGPU::V_ASHRREV_I32_e32:
    550   case AMDGPU::V_LSHLREV_B32_e32:
    551   case AMDGPU::V_LSHRREV_B32_e64:
    552   case AMDGPU::V_ASHRREV_I32_e64:
    553   case AMDGPU::V_LSHLREV_B32_e64: {
    554     // from: v_lshrrev_b32_e32 v1, 16/24, v0
    555     // to SDWA src:v0 src_sel:WORD_1/BYTE_3
    556 
    557     // from: v_ashrrev_i32_e32 v1, 16/24, v0
    558     // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
    559 
    560     // from: v_lshlrev_b32_e32 v1, 16/24, v0
    561     // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    562     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    563     auto Imm = foldToImm(*Src0);
    564     if (!Imm)
    565       break;
    566 
    567     if (*Imm != 16 && *Imm != 24)
    568       break;
    569 
    570     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    571     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    572     if (TRI->isPhysicalRegister(Src1->getReg()) ||
    573         TRI->isPhysicalRegister(Dst->getReg()))
    574       break;
    575 
    576     if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
    577         Opcode == AMDGPU::V_LSHLREV_B32_e64) {
    578       return make_unique<SDWADstOperand>(
    579           Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    580     } else {
    581       return make_unique<SDWASrcOperand>(
    582           Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
    583           Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
    584           Opcode != AMDGPU::V_LSHRREV_B32_e64);
    585     }
    586     break;
    587   }
    588 
    589   case AMDGPU::V_LSHRREV_B16_e32:
    590   case AMDGPU::V_ASHRREV_I16_e32:
    591   case AMDGPU::V_LSHLREV_B16_e32:
    592   case AMDGPU::V_LSHRREV_B16_e64:
    593   case AMDGPU::V_ASHRREV_I16_e64:
    594   case AMDGPU::V_LSHLREV_B16_e64: {
    595     // from: v_lshrrev_b16_e32 v1, 8, v0
    596     // to SDWA src:v0 src_sel:BYTE_1
    597 
    598     // from: v_ashrrev_i16_e32 v1, 8, v0
    599     // to SDWA src:v0 src_sel:BYTE_1 sext:1
    600 
    601     // from: v_lshlrev_b16_e32 v1, 8, v0
    602     // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    603     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    604     auto Imm = foldToImm(*Src0);
    605     if (!Imm || *Imm != 8)
    606       break;
    607 
    608     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    609     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    610 
    611     if (TRI->isPhysicalRegister(Src1->getReg()) ||
    612         TRI->isPhysicalRegister(Dst->getReg()))
    613       break;
    614 
    615     if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
    616         Opcode == AMDGPU::V_LSHLREV_B16_e64) {
    617       return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    618     } else {
    619       return make_unique<SDWASrcOperand>(
    620             Src1, Dst, BYTE_1, false, false,
    621             Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
    622             Opcode != AMDGPU::V_LSHRREV_B16_e64);
    623     }
    624     break;
    625   }
    626 
    627   case AMDGPU::V_BFE_I32:
    628   case AMDGPU::V_BFE_U32: {
    629     // e.g.:
    630     // from: v_bfe_u32 v1, v0, 8, 8
    631     // to SDWA src:v0 src_sel:BYTE_1
    632 
    633     // offset | width | src_sel
    634     // ------------------------
    635     // 0      | 8     | BYTE_0
    636     // 0      | 16    | WORD_0
    637     // 0      | 32    | DWORD ?
    638     // 8      | 8     | BYTE_1
    639     // 16     | 8     | BYTE_2
    640     // 16     | 16    | WORD_1
    641     // 24     | 8     | BYTE_3
    642 
    643     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    644     auto Offset = foldToImm(*Src1);
    645     if (!Offset)
    646       break;
    647 
    648     MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    649     auto Width = foldToImm(*Src2);
    650     if (!Width)
    651       break;
    652 
    653     SdwaSel SrcSel = DWORD;
    654 
    655     if (*Offset == 0 && *Width == 8)
    656       SrcSel = BYTE_0;
    657     else if (*Offset == 0 && *Width == 16)
    658       SrcSel = WORD_0;
    659     else if (*Offset == 0 && *Width == 32)
    660       SrcSel = DWORD;
    661     else if (*Offset == 8 && *Width == 8)
    662       SrcSel = BYTE_1;
    663     else if (*Offset == 16 && *Width == 8)
    664       SrcSel = BYTE_2;
    665     else if (*Offset == 16 && *Width == 16)
    666       SrcSel = WORD_1;
    667     else if (*Offset == 24 && *Width == 8)
    668       SrcSel = BYTE_3;
    669     else
    670       break;
    671 
    672     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    673     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    674 
    675     if (TRI->isPhysicalRegister(Src0->getReg()) ||
    676         TRI->isPhysicalRegister(Dst->getReg()))
    677       break;
    678 
    679     return make_unique<SDWASrcOperand>(
    680           Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
    681   }
    682 
    683   case AMDGPU::V_AND_B32_e32:
    684   case AMDGPU::V_AND_B32_e64: {
    685     // e.g.:
    686     // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    687     // to SDWA src:v0 src_sel:WORD_0/BYTE_0
    688 
    689     MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    690     MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    691     auto ValSrc = Src1;
    692     auto Imm = foldToImm(*Src0);
    693 
    694     if (!Imm) {
    695       Imm = foldToImm(*Src1);
    696       ValSrc = Src0;
    697     }
    698 
    699     if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
    700       break;
    701 
    702     MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    703 
    704     if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
    705         TRI->isPhysicalRegister(Dst->getReg()))
    706       break;
    707 
    708     return make_unique<SDWASrcOperand>(
    709         ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
    710   }
    711 
    712   case AMDGPU::V_OR_B32_e32:
    713   case AMDGPU::V_OR_B32_e64: {
    714     // Patterns for dst_unused:UNUSED_PRESERVE.
    715     // e.g., from:
    716     // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    717     //                           src1_sel:WORD_1 src2_sel:WORD1
    718     // v_add_f16_e32 v3, v1, v2
    719     // v_or_b32_e32 v4, v0, v3
    720     // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
    721 
    722     // Check if one of operands of v_or_b32 is SDWA instruction
    723     using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    724     auto CheckOROperandsForSDWA =
    725       [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
    726         if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
    727           return CheckRetType(None);
    728 
    729         MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
    730         if (!Op1Def)
    731           return CheckRetType(None);
    732 
    733         MachineInstr *Op1Inst = Op1Def->getParent();
    734         if (!TII->isSDWA(*Op1Inst))
    735           return CheckRetType(None);
    736 
    737         MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
    738         if (!Op2Def)
    739           return CheckRetType(None);
    740 
    741         return CheckRetType(std::make_pair(Op1Def, Op2Def));
    742       };
    743 
    744     MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    745     MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    746     assert(OrSDWA && OrOther);
    747     auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    748     if (!Res) {
    749       OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    750       OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    751       assert(OrSDWA && OrOther);
    752       Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    753       if (!Res)
    754         break;
    755     }
    756 
    757     MachineOperand *OrSDWADef = Res->first;
    758     MachineOperand *OrOtherDef = Res->second;
    759     assert(OrSDWADef && OrOtherDef);
    760 
    761     MachineInstr *SDWAInst = OrSDWADef->getParent();
    762     MachineInstr *OtherInst = OrOtherDef->getParent();
    763 
    764     // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
    765     // destination patterns don't overlap. Compatible instruction can be either
    766     // regular instruction with compatible bitness or SDWA instruction with
    767     // correct dst_sel
    768     // SDWAInst | OtherInst bitness / OtherInst dst_sel
    769     // -----------------------------------------------------
    770     // DWORD    | no                    / no
    771     // WORD_0   | no                    / BYTE_2/3, WORD_1
    772     // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    773     // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    774     // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    775     // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0
    776     // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    777     // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    778     // but v_add_f32 is not.
    779 
    780     // TODO: add support for non-SDWA instructions as OtherInst.
    781     // For now this only works with SDWA instructions. For regular instructions
    782     // there is no way to determine if the instruction writes only 8/16/24-bit
    783     // out of full register size and all registers are at min 32-bit wide.
    784     if (!TII->isSDWA(*OtherInst))
    785       break;
    786 
    787     SdwaSel DstSel = static_cast<SdwaSel>(
    788       TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));;
    789     SdwaSel OtherDstSel = static_cast<SdwaSel>(
    790       TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
    791 
    792     bool DstSelAgree = false;
    793     switch (DstSel) {
    794     case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
    795                                 (OtherDstSel == BYTE_3) ||
    796                                 (OtherDstSel == WORD_1));
    797       break;
    798     case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
    799                                 (OtherDstSel == BYTE_1) ||
    800                                 (OtherDstSel == WORD_0));
    801       break;
    802     case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
    803                                 (OtherDstSel == BYTE_2) ||
    804                                 (OtherDstSel == BYTE_3) ||
    805                                 (OtherDstSel == WORD_1));
    806       break;
    807     case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
    808                                 (OtherDstSel == BYTE_2) ||
    809                                 (OtherDstSel == BYTE_3) ||
    810                                 (OtherDstSel == WORD_1));
    811       break;
    812     case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
    813                                 (OtherDstSel == BYTE_1) ||
    814                                 (OtherDstSel == BYTE_3) ||
    815                                 (OtherDstSel == WORD_0));
    816       break;
    817     case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
    818                                 (OtherDstSel == BYTE_1) ||
    819                                 (OtherDstSel == BYTE_2) ||
    820                                 (OtherDstSel == WORD_0));
    821       break;
    822     default: DstSelAgree = false;
    823     }
    824 
    825     if (!DstSelAgree)
    826       break;
    827 
    828     // Also OtherInst dst_unused should be UNUSED_PAD
    829     DstUnused OtherDstUnused = static_cast<DstUnused>(
    830       TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    831     if (OtherDstUnused != DstUnused::UNUSED_PAD)
    832       break;
    833 
    834     // Create DstPreserveOperand
    835     MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    836     assert(OrDst && OrDst->isReg());
    837 
    838     return make_unique<SDWADstPreserveOperand>(
    839       OrDst, OrSDWADef, OrOtherDef, DstSel);
    840 
    841   }
    842   }
    843 
    844   return std::unique_ptr<SDWAOperand>(nullptr);
    845 }
    846 
    847 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
    848   for (MachineInstr &MI : MBB) {
    849     if (auto Operand = matchSDWAOperand(MI)) {
    850       LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
    851       SDWAOperands[&MI] = std::move(Operand);
    852       ++NumSDWAPatternsFound;
    853     }
    854   }
    855 }
    856 
    857 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
    858                                          const GCNSubtarget &ST) const {
    859   // Check if this is already an SDWA instruction
    860   unsigned Opc = MI.getOpcode();
    861   if (TII->isSDWA(Opc))
    862     return true;
    863 
    864   // Check if this instruction has opcode that supports SDWA
    865   if (AMDGPU::getSDWAOp(Opc) == -1)
    866     Opc = AMDGPU::getVOPe32(Opc);
    867 
    868   if (AMDGPU::getSDWAOp(Opc) == -1)
    869     return false;
    870 
    871   if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    872     return false;
    873 
    874   if (TII->isVOPC(Opc)) {
    875     if (!ST.hasSDWASdst()) {
    876       const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
    877       if (SDst && SDst->getReg() != AMDGPU::VCC)
    878         return false;
    879     }
    880 
    881     if (!ST.hasSDWAOutModsVOPC() &&
    882         (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
    883          TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
    884       return false;
    885 
    886   } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
    887              !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    888     return false;
    889   }
    890 
    891   if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
    892                            Opc == AMDGPU::V_MAC_F32_e32))
    893     return false;
    894 
    895   // FIXME: has SDWA but require handling of implicit VCC use
    896   if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    897     return false;
    898 
    899   return true;
    900 }
    901 
/// Build the SDWA form of \p MI, append all of its operands in the order the
/// SDWA MCInstrDesc declares them, then fold the matched \p SDWAOperands into
/// the new instruction. Returns true and erases \p MI on success; erases the
/// tentative SDWA instruction and returns false if no operand pattern applied.
///
/// NOTE: the operand-append sequence below (dst, src0_modifiers, src0,
/// src1_modifiers, src1, src2, clamp, omod, dst_sel, dst_unused, src0_sel,
/// src1_sel, tied-preserve) mirrors the SDWA instruction descriptions and
/// must not be reordered.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa: either MI already is SDWA, or look up its SDWA
  // counterpart (possibly through the VOP e32 encoding).
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  // Callers have already vetted MI via isConvertibleToSDWA.
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst, if it is present in original then should also be present in SDWA.
  // Prefer vdst, then sdst; a VOPC with neither implicitly defines VCC.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  // Modifiers come before the source in the SDWA encoding; default to 0 when
  // the original (non-SDWA/VOP2) form had none.
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  // (omod only exists on subtargets with SDWA omod support).
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  // (default DWORD = write the full 32-bit destination).
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in it's SDWA form,
    // with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    // Re-establish the vdst<->preserve tie on the rebuilt instruction.
    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intesection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into 3rd
    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
    // already destroyed). So if SDWAOperand is also a potential MI then do not
    // apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  // If nothing folded, the speculative SDWA instruction is useless — drop it
  // and keep the original MI.
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
   1081 
   1082 // If an instruction was converted to SDWA it should not have immediates or SGPR
   1083 // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
   1084 void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
   1085                                             const GCNSubtarget &ST) const {
   1086   const MCInstrDesc &Desc = TII->get(MI.getOpcode());
   1087   unsigned ConstantBusCount = 0;
   1088   for (MachineOperand &Op : MI.explicit_uses()) {
   1089     if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
   1090       continue;
   1091 
   1092     unsigned I = MI.getOperandNo(&Op);
   1093     if (Desc.OpInfo[I].RegClass == -1 ||
   1094        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
   1095       continue;
   1096 
   1097     if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
   1098         TRI->isSGPRReg(*MRI, Op.getReg())) {
   1099       ++ConstantBusCount;
   1100       continue;
   1101     }
   1102 
   1103     unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   1104     auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
   1105                         TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
   1106     if (Op.isImm())
   1107       Copy.addImm(Op.getImm());
   1108     else if (Op.isReg())
   1109       Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
   1110                   Op.getSubReg());
   1111     Op.ChangeToRegister(VGPR, false);
   1112   }
   1113 }
   1114 
   1115 bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
   1116   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   1117 
   1118   if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
   1119     return false;
   1120 
   1121   MRI = &MF.getRegInfo();
   1122   TRI = ST.getRegisterInfo();
   1123   TII = ST.getInstrInfo();
   1124 
   1125   // Find all SDWA operands in MF.
   1126   bool Ret = false;
   1127   for (MachineBasicBlock &MBB : MF) {
   1128     bool Changed = false;
   1129     do {
   1130       matchSDWAOperands(MBB);
   1131 
   1132       for (const auto &OperandPair : SDWAOperands) {
   1133         const auto &Operand = OperandPair.second;
   1134         MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
   1135         if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
   1136           PotentialMatches[PotentialMI].push_back(Operand.get());
   1137         }
   1138       }
   1139 
   1140       for (auto &PotentialPair : PotentialMatches) {
   1141         MachineInstr &PotentialMI = *PotentialPair.first;
   1142         convertToSDWA(PotentialMI, PotentialPair.second);
   1143       }
   1144 
   1145       PotentialMatches.clear();
   1146       SDWAOperands.clear();
   1147 
   1148       Changed = !ConvertedInstructions.empty();
   1149 
   1150       if (Changed)
   1151         Ret = true;
   1152       while (!ConvertedInstructions.empty())
   1153         legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
   1154     } while (Changed);
   1155   }
   1156 
   1157   return Ret;
   1158 }
   1159