//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_I32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace with:
///   V_ADD_I32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

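// Helper describing one matched SDWA pattern. Target is the operand that
// will appear in the converted instruction; Replaced is the operand it
// stands in for. E.g. for the shift pattern in the file header comment,
// Target is the operand holding %1 and Replaced is the one holding %0.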
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

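// Combine the src modifiers already present on the matching source operand
// of SrcOp's parent instruction with the abs/neg/sext flags recorded for
// this SDWA operand.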
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0;
    Mods ^= Neg ? SISrcMods::NEG : 0;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the replaced operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand the potential instruction is the one that defines
  // the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

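// Convert the matched v_or_b32 preserve pattern: MI (the SDWA instruction)
// is moved into the position of the v_or_b32, given an implicit use of the
// preserved register tied to its vdst, and the v_or_b32 itself is erased by
// the base class conversion below.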
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problems with uses of killed operands.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add implicit use of the preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

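// Try to reduce Op to a compile-time constant. Returns None if the value
// cannot be proven constant; matchSDWAOperand relies on this to test shift
// amounts and and-masks, so both a plain immediate and a register copied
// from a foldable immediate are accepted.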
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (TRI->isPhysicalRegister(Src1->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(Src1->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32:
  case AMDGPU::V_BFE_U32: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(Src0->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
        TRI->isPhysicalRegister(Dst->getReg()))
      break;

    return make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination patterns don't overlap. A compatible instruction
    // can be either a regular instruction with compatible bitness or an SDWA
    // instruction with a correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24-bit
    // out of the full register size and all registers are at min 32-bit wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && SDst->getReg() != AMDGPU::VCC)
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  return true;
}

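// Rewrite MI as its SDWA counterpart: build a new MachineInstr with the SDWA
// opcode, copy the original operands over (initializing the SDWA-specific
// operands to their defaults), then let every matched SDWAOperand rewrite the
// new instruction. If no operand could be applied, the new instruction is
// erased again and MI is left untouched.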
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst; if it is present in the original instruction then it should
  // also be present in the SDWA one
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

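  // Note: operands are appended in the order fixed by the SDWA MCInstrDesc;
  // every optional operand missing from the original instruction gets a
  // neutral default below (0 for modifiers/clamp/omod, DWORD for selects,
  // UNUSED_PAD for dst_unused).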

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if an SDWAOperand is also
    // a potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
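  // Matching repeats per block until it reaches a fixed point: converting an
  // instruction can expose new matches (e.g. a freshly created _sdwa
  // instruction may become the OtherInst of a v_or_b32 preserve pattern), so
  // the loop below reruns the match/convert cycle while anything changed.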
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}