1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Insert wait instructions for memory reads and writes. 12 /// 13 /// Memory reads and writes are issued asynchronously, so we need to insert 14 /// S_WAITCNT instructions when we want to access any of their results or 15 /// overwrite any register that's used asynchronously. 16 // 17 //===----------------------------------------------------------------------===// 18 19 #include "AMDGPU.h" 20 #include "AMDGPUSubtarget.h" 21 #include "SIDefines.h" 22 #include "SIInstrInfo.h" 23 #include "SIMachineFunctionInfo.h" 24 #include "llvm/CodeGen/MachineFunction.h" 25 #include "llvm/CodeGen/MachineFunctionPass.h" 26 #include "llvm/CodeGen/MachineInstrBuilder.h" 27 #include "llvm/CodeGen/MachineRegisterInfo.h" 28 29 using namespace llvm; 30 31 namespace { 32 33 /// \brief One variable for each of the hardware counters 34 typedef union { 35 struct { 36 unsigned VM; 37 unsigned EXP; 38 unsigned LGKM; 39 } Named; 40 unsigned Array[3]; 41 42 } Counters; 43 44 typedef enum { 45 OTHER, 46 SMEM, 47 VMEM 48 } InstType; 49 50 typedef Counters RegCounters[512]; 51 typedef std::pair<unsigned, unsigned> RegInterval; 52 53 class SIInsertWaits : public MachineFunctionPass { 54 55 private: 56 static char ID; 57 const SIInstrInfo *TII; 58 const SIRegisterInfo *TRI; 59 const MachineRegisterInfo *MRI; 60 61 /// \brief Constant hardware limits 62 static const Counters WaitCounts; 63 64 /// \brief Constant zero value 65 static const Counters ZeroCounts; 66 67 /// \brief Counter values we have already waited on. 68 Counters WaitedOn; 69 70 /// \brief Counter values for last instruction issued. 71 Counters LastIssued; 72 73 /// \brief Registers used by async instructions. 74 RegCounters UsedRegs; 75 76 /// \brief Registers defined by async instructions. 77 RegCounters DefinedRegs; 78 79 /// \brief Different export instruction types seen since last wait. 80 unsigned ExpInstrTypesSeen; 81 82 /// \brief Type of the last opcode. 83 InstType LastOpcodeType; 84 85 bool LastInstWritesM0; 86 87 /// \brief Get increment/decrement amount for this instruction. 88 Counters getHwCounts(MachineInstr &MI); 89 90 /// \brief Is operand relevant for async execution? 91 bool isOpRelevant(MachineOperand &Op); 92 93 /// \brief Get register interval an operand affects. 94 RegInterval getRegInterval(const TargetRegisterClass *RC, 95 const MachineOperand &Reg) const; 96 97 /// \brief Handle instructions async components 98 void pushInstruction(MachineBasicBlock &MBB, 99 MachineBasicBlock::iterator I); 100 101 /// \brief Insert the actual wait instruction 102 bool insertWait(MachineBasicBlock &MBB, 103 MachineBasicBlock::iterator I, 104 const Counters &Counts); 105 106 /// \brief Do we need def2def checks? 107 bool unorderedDefines(MachineInstr &MI); 108 109 /// \brief Resolve all operand dependencies to counter requirements 110 Counters handleOperands(MachineInstr &MI); 111 112 /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. 113 void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); 114 115 public: 116 SIInsertWaits(TargetMachine &tm) : 117 MachineFunctionPass(ID), 118 TII(nullptr), 119 TRI(nullptr), 120 ExpInstrTypesSeen(0) { } 121 122 bool runOnMachineFunction(MachineFunction &MF) override; 123 124 const char *getPassName() const override { 125 return "SI insert wait instructions"; 126 } 127 128 void getAnalysisUsage(AnalysisUsage &AU) const override { 129 AU.setPreservesCFG(); 130 MachineFunctionPass::getAnalysisUsage(AU); 131 } 132 }; 133 134 } // End anonymous namespace 135 136 char SIInsertWaits::ID = 0; 137 138 const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; 139 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; 140 141 FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { 142 return new SIInsertWaits(tm); 143 } 144 145 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { 146 uint64_t TSFlags = MI.getDesc().TSFlags; 147 Counters Result = { { 0, 0, 0 } }; 148 149 Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); 150 151 // Only consider stores or EXP for EXP_CNT 152 Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && 153 (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); 154 155 // LGKM may uses larger values 156 if (TSFlags & SIInstrFlags::LGKM_CNT) { 157 158 if (TII->isSMRD(MI)) { 159 160 if (MI.getNumOperands() != 0) { 161 assert(MI.getOperand(0).isReg() && 162 "First LGKM operand must be a register!"); 163 164 // XXX - What if this is a write into a super register? 165 const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); 166 unsigned Size = RC->getSize(); 167 Result.Named.LGKM = Size > 4 ? 2 : 1; 168 } else { 169 // s_dcache_inv etc. do not have a a destination register. Assume we 170 // want a wait on these. 171 // XXX - What is the right value? 172 Result.Named.LGKM = 1; 173 } 174 } else { 175 // DS 176 Result.Named.LGKM = 1; 177 } 178 179 } else { 180 Result.Named.LGKM = 0; 181 } 182 183 return Result; 184 } 185 186 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { 187 // Constants are always irrelevant 188 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) 189 return false; 190 191 // Defines are always relevant 192 if (Op.isDef()) 193 return true; 194 195 // For exports all registers are relevant 196 MachineInstr &MI = *Op.getParent(); 197 if (MI.getOpcode() == AMDGPU::EXP) 198 return true; 199 200 // For stores the stored value is also relevant 201 if (!MI.getDesc().mayStore()) 202 return false; 203 204 // Check if this operand is the value being stored. 205 // Special case for DS instructions, since the address 206 // operand comes before the value operand and it may have 207 // multiple data operands. 208 209 if (TII->isDS(MI)) { 210 MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); 211 if (Data && Op.isIdenticalTo(*Data)) 212 return true; 213 214 MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); 215 if (Data0 && Op.isIdenticalTo(*Data0)) 216 return true; 217 218 MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); 219 if (Data1 && Op.isIdenticalTo(*Data1)) 220 return true; 221 222 return false; 223 } 224 225 // NOTE: This assumes that the value operand is before the 226 // address operand, and that there is only one value operand. 227 for (MachineInstr::mop_iterator I = MI.operands_begin(), 228 E = MI.operands_end(); I != E; ++I) { 229 230 if (I->isReg() && I->isUse()) 231 return Op.isIdenticalTo(*I); 232 } 233 234 return false; 235 } 236 237 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, 238 const MachineOperand &Reg) const { 239 unsigned Size = RC->getSize(); 240 assert(Size >= 4); 241 242 RegInterval Result; 243 Result.first = TRI->getEncodingValue(Reg.getReg()); 244 Result.second = Result.first + Size / 4; 245 246 return Result; 247 } 248 249 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, 250 MachineBasicBlock::iterator I) { 251 252 // Get the hardware counter increments and sum them up 253 Counters Increment = getHwCounts(*I); 254 Counters Limit = ZeroCounts; 255 unsigned Sum = 0; 256 257 for (unsigned i = 0; i < 3; ++i) { 258 LastIssued.Array[i] += Increment.Array[i]; 259 if (Increment.Array[i]) 260 Limit.Array[i] = LastIssued.Array[i]; 261 Sum += Increment.Array[i]; 262 } 263 264 // If we don't increase anything then that's it 265 if (Sum == 0) { 266 LastOpcodeType = OTHER; 267 return; 268 } 269 270 if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= 271 AMDGPUSubtarget::VOLCANIC_ISLANDS) { 272 // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM 273 // or SMEM clause, respectively. 274 // 275 // The temporary workaround is to break the clauses with S_NOP. 276 // 277 // The proper solution would be to allocate registers such that all source 278 // and destination registers don't overlap, e.g. this is illegal: 279 // r0 = load r2 280 // r2 = load r0 281 if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || 282 (LastOpcodeType == VMEM && Increment.Named.VM)) { 283 // Insert a NOP to break the clause. 284 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) 285 .addImm(0); 286 LastInstWritesM0 = false; 287 } 288 289 if (TII->isSMRD(*I)) 290 LastOpcodeType = SMEM; 291 else if (Increment.Named.VM) 292 LastOpcodeType = VMEM; 293 } 294 295 // Remember which export instructions we have seen 296 if (Increment.Named.EXP) { 297 ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; 298 } 299 300 for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { 301 MachineOperand &Op = I->getOperand(i); 302 if (!isOpRelevant(Op)) 303 continue; 304 305 const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); 306 RegInterval Interval = getRegInterval(RC, Op); 307 for (unsigned j = Interval.first; j < Interval.second; ++j) { 308 309 // Remember which registers we define 310 if (Op.isDef()) 311 DefinedRegs[j] = Limit; 312 313 // and which one we are using 314 if (Op.isUse()) 315 UsedRegs[j] = Limit; 316 } 317 } 318 } 319 320 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, 321 MachineBasicBlock::iterator I, 322 const Counters &Required) { 323 324 // End of program? No need to wait on anything 325 if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) 326 return false; 327 328 // Figure out if the async instructions execute in order 329 bool Ordered[3]; 330 331 // VM_CNT is always ordered 332 Ordered[0] = true; 333 334 // EXP_CNT is unordered if we have both EXP & VM-writes 335 Ordered[1] = ExpInstrTypesSeen == 3; 336 337 // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS 338 Ordered[2] = false; 339 340 // The values we are going to put into the S_WAITCNT instruction 341 Counters Counts = WaitCounts; 342 343 // Do we really need to wait? 344 bool NeedWait = false; 345 346 for (unsigned i = 0; i < 3; ++i) { 347 348 if (Required.Array[i] <= WaitedOn.Array[i]) 349 continue; 350 351 NeedWait = true; 352 353 if (Ordered[i]) { 354 unsigned Value = LastIssued.Array[i] - Required.Array[i]; 355 356 // Adjust the value to the real hardware possibilities. 357 Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); 358 359 } else 360 Counts.Array[i] = 0; 361 362 // Remember on what we have waited on. 363 WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; 364 } 365 366 if (!NeedWait) 367 return false; 368 369 // Reset EXP_CNT instruction types 370 if (Counts.Named.EXP == 0) 371 ExpInstrTypesSeen = 0; 372 373 // Build the wait instruction 374 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) 375 .addImm((Counts.Named.VM & 0xF) | 376 ((Counts.Named.EXP & 0x7) << 4) | 377 ((Counts.Named.LGKM & 0x7) << 8)); 378 379 LastOpcodeType = OTHER; 380 LastInstWritesM0 = false; 381 return true; 382 } 383 384 /// \brief helper function for handleOperands 385 static void increaseCounters(Counters &Dst, const Counters &Src) { 386 387 for (unsigned i = 0; i < 3; ++i) 388 Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); 389 } 390 391 Counters SIInsertWaits::handleOperands(MachineInstr &MI) { 392 393 Counters Result = ZeroCounts; 394 395 // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, 396 // but we also want to wait for any other outstanding transfers before 397 // signalling other hardware blocks 398 if (MI.getOpcode() == AMDGPU::S_SENDMSG) 399 return LastIssued; 400 401 // For each register affected by this instruction increase the result 402 // sequence. 403 // 404 // TODO: We could probably just look at explicit operands if we removed VCC / 405 // EXEC from SMRD dest reg classes. 406 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 407 MachineOperand &Op = MI.getOperand(i); 408 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) 409 continue; 410 411 const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); 412 RegInterval Interval = getRegInterval(RC, Op); 413 for (unsigned j = Interval.first; j < Interval.second; ++j) { 414 415 if (Op.isDef()) { 416 increaseCounters(Result, UsedRegs[j]); 417 increaseCounters(Result, DefinedRegs[j]); 418 } 419 420 if (Op.isUse()) 421 increaseCounters(Result, DefinedRegs[j]); 422 } 423 } 424 425 return Result; 426 } 427 428 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, 429 MachineBasicBlock::iterator I) { 430 if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() < 431 AMDGPUSubtarget::VOLCANIC_ISLANDS) 432 return; 433 434 // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. 435 if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { 436 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); 437 LastInstWritesM0 = false; 438 return; 439 } 440 441 // Set whether this instruction sets M0 442 LastInstWritesM0 = false; 443 444 unsigned NumOperands = I->getNumOperands(); 445 for (unsigned i = 0; i < NumOperands; i++) { 446 const MachineOperand &Op = I->getOperand(i); 447 448 if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) 449 LastInstWritesM0 = true; 450 } 451 } 452 453 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" 454 // around other non-memory instructions. 455 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { 456 bool Changes = false; 457 458 TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); 459 TRI = 460 static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); 461 462 MRI = &MF.getRegInfo(); 463 464 WaitedOn = ZeroCounts; 465 LastIssued = ZeroCounts; 466 LastOpcodeType = OTHER; 467 LastInstWritesM0 = false; 468 469 memset(&UsedRegs, 0, sizeof(UsedRegs)); 470 memset(&DefinedRegs, 0, sizeof(DefinedRegs)); 471 472 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 473 BI != BE; ++BI) { 474 475 MachineBasicBlock &MBB = *BI; 476 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); 477 I != E; ++I) { 478 479 // Wait for everything before a barrier. 480 if (I->getOpcode() == AMDGPU::S_BARRIER) 481 Changes |= insertWait(MBB, I, LastIssued); 482 else 483 Changes |= insertWait(MBB, I, handleOperands(*I)); 484 485 pushInstruction(MBB, I); 486 handleSendMsg(MBB, I); 487 } 488 489 // Wait for everything at the end of the MBB 490 Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); 491 } 492 493 return Changes; 494 } 495