//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  unsigned SIMDPerCU = 4;

  unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
  return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
           MaxInvocationsPerWave;
}

static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);

  unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
  unsigned ReservedSGPRCount;

  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    TotalSGPRCountPerSIMD = 800;
    AddressableSGPRCount = 102;
    SGPRUsageAlignment = 16;
    ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
  } else {
    TotalSGPRCountPerSIMD = 512;
    AddressableSGPRCount = 104;
    SGPRUsageAlignment = 8;
    ReservedSGPRCount = 2; // VCC
  }

  unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
  MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);

  if (ST.hasSGPRInitBug())
    MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

  return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
}

static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
  unsigned TotalVGPRCountPerSIMD = 256;
  unsigned VGPRUsageAlignment = 4;

  return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
                   VGPRUsageAlignment);
}

static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (unsigned i = 0; PSets[i] != -1; ++i) {
    if (PSets[i] == (int)PSetID)
      return true;
  }
  return false;
}

void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
                                         BitVector &PressureSets) const {
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
    const int *PSets = getRegUnitPressureSets(*U);
    if (hasPressureSet(PSets, PSetID)) {
      PressureSets.set(PSetID);
      break;
    }
  }
}

SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
                                   SGPRPressureSets(getNumRegPressureSets()),
                                   VGPRPressureSets(getNumRegPressureSets()) {
  unsigned NumRegPressureSets = getNumRegPressureSets();

  SGPR32SetID = NumRegPressureSets;
  VGPR32SetID = NumRegPressureSets;
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0)
      SGPR32SetID = i;
    else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
      VGPR32SetID = i;

    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
  }
  assert(SGPR32SetID < NumRegPressureSets &&
         VGPR32SetID < NumRegPressureSets);
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
  MCRegAliasIterator R(Reg, this, true);

  for (; R.isValid(); ++R)
    Reserved.set(*R);
}

unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}

unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
  unsigned Reg;

  // Try to place it in a hole after PrivateSegmentBufferReg.
  if (RegCount & 3) {
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
    // alignment constraints, so we have a hole where we can put the wave
    // offset.
    Reg = RegCount - 1;
  } else {
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
    // wave offset before it.
    Reg = RegCount - 5;
  }
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);

  unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
  unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
  }

  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
  }

  // Reserve registers for debugger usage if "amdgpu-debugger-reserve-trap-regs"
  // attribute was specified.
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.debuggerReserveRegs()) {
    unsigned ReservedVGPRFirst =
      MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount();
    for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) {
      unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  return Reserved;
}

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  const SISubtarget &STI = MF.getSubtarget<SISubtarget>();
  // FIXME: We should adjust the max number of waves based on LDS size.
  unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU());
  unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());

  unsigned VSLimit = SGPRLimit + VGPRLimit;

  if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
    // FIXME: This is a hack. We should never be considering the pressure of
    // these since no virtual register should ever have this class.
    return VSLimit;
  }

  if (SGPRPressureSets.test(Idx))
    return SGPRLimit;

  return VGPRLimit;
}

bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  return Fn.getFrameInfo()->hasStackObjects();
}

bool
SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
  return MF.getFrameInfo()->hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
  const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
  // This helps catch bugs as verifier errors.
  return true;
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return 0;

  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr) &&
         "Should never see frame index on non-address operand");

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  return MI->mayLoadOrStore();
}

void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                  unsigned BaseReg,
                                                  int FrameIdx,
                                                  int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
      .addFrameIndex(FrameIdx);
    return;
  }

  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
    .addReg(UnusedCarry, RegState::Define | RegState::Dead)
    .addReg(OffsetReg, RegState::Kill)
    .addFrameIndex(FrameIdx);
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                       int64_t Offset) const {

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = Subtarget.getInstrInfo();

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");

  assert(TII->isMUBUF(MI));

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;
  if (isUInt<12>(NewOffset)) {
    // If we have a legal offset, fold it directly into the instruction.
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

  // The offset is not legal, so we must insert an add of the offset.
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  DebugLoc DL = MI.getDebugLoc();

  assert(Offset != 0 && "Non-zero offset expected");

  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // If the instruction already had an immediate offset, only the requested new
  // offset is added here, because the original immediate is left in place.
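  // Materialize the offset in a scalar register and add it to the base with a
  // VALU add; the carry-out result is unused.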
  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
    .addImm(Offset);
  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg)
    .addReg(UnusedCarry, RegState::Define | RegState::Dead)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(BaseReg);

  FIOp->ChangeToRegister(NewReg, false);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        unsigned BaseReg,
                                        int64_t Offset) const {
  return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
                                           unsigned LoadStoreOp,
                                           const MachineOperand *SrcDst,
                                           unsigned ScratchRsrcReg,
                                           unsigned ScratchOffset,
                                           int64_t Offset,
                                           RegScavenger *RS) const {

  unsigned Value = SrcDst->getReg();
  bool IsKill = SrcDst->isKill();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  DebugLoc DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  bool RanOutOfSGPRs = false;
  bool Scavenged = false;
  unsigned SOffset = ScratchOffset;
  unsigned OriginalImmOffset = Offset;

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned Size = NumSubRegs * 4;

  if (!isUInt<12>(Offset + Size)) {
    SOffset = AMDGPU::NoRegister;

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
      RanOutOfSGPRs = true;
      SOffset = ScratchOffset;
    } else {
      Scavenged = true;
    }
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
      .addReg(ScratchOffset)
      .addImm(Offset);
    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
    unsigned SubReg = NumSubRegs > 1 ?
      getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
      Value;

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
      .addReg(SubReg, getDefRegState(!IsStore))
      .addReg(ScratchRsrcReg)
      .addReg(SOffset, SOffsetRegState)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addReg(Value, RegState::Implicit | SrcDstRegState)
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  }
  if (RanOutOfSGPRs) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
      .addReg(ScratchOffset)
      .addImm(OriginalImmOffset);
  }
}

void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      unsigned SuperReg = MI->getOperand(0).getReg();
      bool IsKill = MI->getOperand(0).isKill();
      // SubReg carries the "Kill" flag when SubReg == SuperReg.
      unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(SuperReg,
                                           &AMDGPU::SGPR_32RegClass, i);

        struct SIMachineFunctionInfo::SpilledReg Spill =
          MFI->getSpilledReg(MF, Index, i);

        if (Spill.hasReg()) {
          BuildMI(*MBB, MI, DL,
                  TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
                  Spill.VGPR)
            .addReg(SubReg, getKillRegState(IsKill))
            .addImm(Spill.Lane);

          // FIXME: Since this spills to another register instead of an actual
          // frame index, we should delete the frame index when all references
          // to it are fixed.
        } else {
          // Spill SGPR to a frame index.
          // FIXME: We should use S_STORE_DWORD here for VI.
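          // Copy the SGPR into a temporary VGPR so the VGPR spill pseudo below
          // can store it to the stack slot.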
          MachineInstrBuilder Mov
            = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
              .addReg(SubReg, SubKillState);

          // There could be undef components of a spilled super register.
          // TODO: Can we detect this and skip the spill?
          if (NumSubRegs > 1) {
            // The last implicit use of the SuperReg carries the "Kill" flag.
            unsigned SuperKillState = 0;
            if (i + 1 == e)
              SuperKillState |= getKillRegState(IsKill);
            Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
          }

          unsigned Size = FrameInfo->getObjectSize(Index);
          unsigned Align = FrameInfo->getObjectAlignment(Index);
          MachinePointerInfo PtrInfo
            = MachinePointerInfo::getFixedStack(*MF, Index);
          MachineMemOperand *MMO
            = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                       Size, Align);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
            .addReg(TmpReg, RegState::Kill)         // src
            .addFrameIndex(Index)                   // frame_idx
            .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
            .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
            .addImm(i * 4)                          // offset
            .addMemOperand(MMO);
        }
      }
      MI->eraseFromParent();
      MFI->addToSpilledSGPRs(NumSubRegs);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);
        struct SIMachineFunctionInfo::SpilledReg Spill =
          MFI->getSpilledReg(MF, Index, i);

        if (Spill.hasReg()) {
          BuildMI(*MBB, MI, DL,
                  TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                  SubReg)
            .addReg(Spill.VGPR)
            .addImm(Spill.Lane)
            .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
        } else {
          // Restore SGPR from a stack slot.
          // FIXME: We should use S_LOAD_DWORD here for VI.
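          // Reload the value into a temporary VGPR, then move it back into the
          // SGPR with V_READFIRSTLANE_B32.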
          unsigned Align = FrameInfo->getObjectAlignment(Index);
          unsigned Size = FrameInfo->getObjectSize(Index);

          MachinePointerInfo PtrInfo
            = MachinePointerInfo::getFixedStack(*MF, Index);

          MachineMemOperand *MMO = MF->getMachineMemOperand(
            PtrInfo, MachineMemOperand::MOLoad, Size, Align);

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
            .addFrameIndex(Index)                   // frame_idx
            .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
            .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
            .addImm(i * 4)                          // offset
            .addMemOperand(MMO);
          BuildMI(*MBB, MI, DL,
                  TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
            .addReg(TmpReg, RegState::Kill)
            .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
        }
      }

      MI->eraseFromParent();
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
      buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::src),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index) +
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
      // Update the spill statistics before erasing MI; the original order used
      // MI->getOpcode() after eraseFromParent().
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::dst),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index) +
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      int64_t Offset = FrameInfo->getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}

// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
  assert(!TargetRegisterInfo::isVirtualRegister(Reg));

  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::SCC_CLASSRegClass,
  };

  for (const TargetRegisterClass *BaseClass : BaseClasses) {
    if (BaseClass->contains(Reg)) {
      return BaseClass;
    }
  }
  return nullptr;
}

// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
  switch (RC->getSize()) {
  case 0: return false;
  case 1: return false;
  case 4:
    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
  case 8:
    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
  case 12:
    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
  case 16:
    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
  case 32:
    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
  case 64:
    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                        const TargetRegisterClass *SRC) const {
  switch (SRC->getSize()) {
  case 4:
    return &AMDGPU::VGPR_32RegClass;
  case 8:
    return &AMDGPU::VReg_64RegClass;
  case 12:
    return &AMDGPU::VReg_96RegClass;
  case 16:
    return &AMDGPU::VReg_128RegClass;
  case 32:
    return &AMDGPU::VReg_256RegClass;
  case 64:
    return &AMDGPU::VReg_512RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
                                        const TargetRegisterClass *VRC) const {
  switch (VRC->getSize()) {
  case 4:
    return &AMDGPU::SGPR_32RegClass;
  case 8:
    return &AMDGPU::SReg_64RegClass;
  case 16:
    return &AMDGPU::SReg_128RegClass;
  case 32:
    return &AMDGPU::SReg_256RegClass;
  case 64:
    return &AMDGPU::SReg_512RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}

const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
  const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
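  // For example, a sub-register index covering two lanes selects a 64-bit
  // register class.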
  unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
  if (isSGPRClass(RC)) {
    switch (Count) {
    case 1:
      return &AMDGPU::SGPR_32RegClass;
    case 2:
      return &AMDGPU::SReg_64RegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  } else {
    switch (Count) {
    case 1:
      return &AMDGPU::VGPR_32RegClass;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    case 8:
      return &AMDGPU::VReg_256RegClass;
    case 16: /* fall-through */
    default:
      llvm_unreachable("Invalid sub-register class size");
    }
  }
}

bool SIRegisterInfo::shouldRewriteCopySrc(
  const TargetRegisterClass *DefRC,
  unsigned DefSubReg,
  const TargetRegisterClass *SrcRC,
  unsigned SrcSubReg) const {
  // We want to prefer the smallest register class possible, so we don't want
  // to stop and rewrite on anything that looks like a subregister
  // extract. Operations mostly don't care about the super register class, so
  // we only want to stop on the most basic of copies between the same register
  // class.
  //
  // e.g. if we have something like
  // vreg0 = ...
  // vreg1 = ...
  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
  // vreg3 = COPY vreg2, sub0
  //
  // We want to look through the COPY to find:
  // => vreg3 = COPY vreg0

  // Plain copy.
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}

unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                          const TargetRegisterClass *SubRC,
                                          unsigned Channel) const {

  switch (Reg) {
  case AMDGPU::VCC:
    switch(Channel) {
    case 0: return AMDGPU::VCC_LO;
    case 1: return AMDGPU::VCC_HI;
    default: llvm_unreachable("Invalid SubIdx for VCC"); break;
    }

  case AMDGPU::TBA:
    switch(Channel) {
    case 0: return AMDGPU::TBA_LO;
    case 1: return AMDGPU::TBA_HI;
    default: llvm_unreachable("Invalid SubIdx for TBA"); break;
    }

  case AMDGPU::TMA:
    switch(Channel) {
    case 0: return AMDGPU::TMA_LO;
    case 1: return AMDGPU::TMA_HI;
    default: llvm_unreachable("Invalid SubIdx for TMA"); break;
    }

  case AMDGPU::FLAT_SCR:
    switch (Channel) {
    case 0:
      return AMDGPU::FLAT_SCR_LO;
    case 1:
      return AMDGPU::FLAT_SCR_HI;
    default:
      llvm_unreachable("Invalid SubIdx for FLAT_SCR");
    }
    break;

  case AMDGPU::EXEC:
    switch (Channel) {
    case 0:
      return AMDGPU::EXEC_LO;
    case 1:
      return AMDGPU::EXEC_HI;
    default:
      llvm_unreachable("Invalid SubIdx for EXEC");
    }
    break;
  }

  const TargetRegisterClass *RC = getPhysRegClass(Reg);
  // 32-bit registers don't have sub-registers, so we can just return the
  // Reg. We need to have this check here, because the calculation below
  // using getHWRegIndex() will fail with special 32-bit registers like
  // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0.
  if (RC->getSize() == 4) {
    assert(Channel == 0);
    return Reg;
  }

  unsigned Index = getHWRegIndex(Reg);
  return SubRC->getRegister(Index + Channel);
}

bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
  return OpType == AMDGPU::OPERAND_REG_IMM32;
}

bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
  if (opCanUseLiteralConstant(OpType))
    return true;

  return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}

// FIXME: Most of these are flexible with HSA and we don't need to reserve them
// as input registers if unused. Whether the dispatch ptr is necessary should
// be easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                           enum PreloadedValue Value) const {

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  (void)ST;
  switch (Value) {
  case SIRegisterInfo::WORKGROUP_ID_X:
    assert(MFI->hasWorkGroupIDX());
    return MFI->WorkGroupIDXSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Y:
    assert(MFI->hasWorkGroupIDY());
    return MFI->WorkGroupIDYSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Z:
    assert(MFI->hasWorkGroupIDZ());
    return MFI->WorkGroupIDZSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
    assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
    assert(MFI->hasPrivateSegmentBuffer());
    return MFI->PrivateSegmentBufferUserSGPR;
  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
    assert(MFI->hasKernargSegmentPtr());
    return MFI->KernargSegmentPtrUserSGPR;
  case SIRegisterInfo::DISPATCH_ID:
    llvm_unreachable("unimplemented");
  case SIRegisterInfo::FLAT_SCRATCH_INIT:
    assert(MFI->hasFlatScratchInit());
    return MFI->FlatScratchInitUserSGPR;
  case SIRegisterInfo::DISPATCH_PTR:
    assert(MFI->hasDispatchPtr());
    return MFI->DispatchPtrUserSGPR;
  case SIRegisterInfo::QUEUE_PTR:
    assert(MFI->hasQueuePtr());
    return MFI->QueuePtrUserSGPR;
  case SIRegisterInfo::WORKITEM_ID_X:
    assert(MFI->hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case SIRegisterInfo::WORKITEM_ID_Y:
    assert(MFI->hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case SIRegisterInfo::WORKITEM_ID_Z:
    assert(MFI->hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected preloaded value type");
}

/// \brief Returns a register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister.
unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
                                            const TargetRegisterClass *RC) const {
  for (unsigned Reg : *RC)
    if (!MRI.isPhysRegUsed(Reg))
      return Reg;
  return AMDGPU::NoRegister;
}

unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
  switch(WaveCount) {
    case 10: return 24;
    case 9:  return 28;
    case 8:  return 32;
    case 7:  return 36;
    case 6:  return 40;
    case 5:  return 48;
    case 4:  return 64;
    case 3:  return 84;
    case 2:  return 128;
    default: return 256;
  }
}

unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST,
                                            unsigned WaveCount) const {
  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    switch (WaveCount) {
      case 10: return 80;
      case 9:  return 80;
      case 8:  return 96;
      default: return 102;
    }
  } else {
    switch(WaveCount) {
      case 10: return 48;
      case 9:  return 56;
      case 8:  return 64;
      case 7:  return 72;
      case 6:  return 80;
      case 5:  return 96;
      default: return 103;
    }
  }
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            unsigned Reg) const {
  const TargetRegisterClass *RC;
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    RC = MRI.getRegClass(Reg);
  else
    RC = getPhysRegClass(Reg);

  return hasVGPRs(RC);
}