//===----------------------- SIFrameLowering.cpp --------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;


static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      ST.getMaxNumSGPRs(MF));
}

void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
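  // A worked example (illustrative only): a combined byte offset of 0x1000
  // becomes 0x1000 >> 8 = 0x10 once expressed in 256-byte units.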
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const GCNSubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  if (ScratchRsrcReg == AMDGPU::NoRegister ||
      !MRI.isPhysRegUsed(ScratchRsrcReg))
    return AMDGPU::NoRegister;

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. The only
  // ones we cannot eliminate are the resources required for scratch access.
  // For now we skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(
    std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

// Shift down registers reserved for the scratch wave offset and stack pointer
// SGPRs.
std::pair<unsigned, unsigned>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
  const GCNSubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // No replacement necessary.
  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
      !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
    assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
    return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
  }

  unsigned SPReg = MFI->getStackPtrOffsetReg();
  if (ST.hasSGPRInitBug())
    return std::make_pair(ScratchWaveOffsetReg, SPReg);

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
  if (NumPreloaded > AllSGPRs.size())
    return std::make_pair(ScratchWaveOffsetReg, SPReg);

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 because s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for registers reserved for scratch resource register
  // + 1 for register reserved for scratch wave offset.
  //     (By excluding this register from the list of candidates, when it is
  //     already being used for the scratch wave offset and there are no
  //     other free SGPRs, the value simply stays in this register.)
  // + 1 if stack pointer is used.
  // ----
  //  13 (+1)
  unsigned ReservedRegCount = 13;

  if (AllSGPRs.size() < ReservedRegCount)
    return std::make_pair(ScratchWaveOffsetReg, SPReg);

  bool HandledScratchWaveOffsetReg =
    ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      if (!HandledScratchWaveOffsetReg) {
        HandledScratchWaveOffsetReg = true;

        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
        MFI->setScratchWaveOffsetReg(Reg);
        ScratchWaveOffsetReg = Reg;
        break;
      }
    }
  }

  return std::make_pair(ScratchWaveOffsetReg, SPReg);
}

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
  // specified.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.debuggerEmitPrologue())
    emitDebuggerPrologue(MF, MBB);

  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  // We need to do the replacement of the private segment buffer and wave
  // offset register even if there are no stack objects. There could be stores
  // to undef or to a constant without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.
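
  // A rough sketch of the stack pointer setup emitted below (illustrative
  // only; register names are placeholders): for a 256-byte per-lane frame on
  // a 64-lane wave,
  //   s_add_u32 sp, scratch_wave_offset, 256 * 64
  // i.e. the per-lane frame size is scaled to a per-wave byte offset.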

  if (MFI->hasFlatScratchInit())
    emitFlatScratchInit(ST, MF, MBB);

  unsigned SPReg = MFI->getStackPtrOffsetReg();
  if (SPReg != AMDGPU::SP_REG) {
    assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");

    DebugLoc DL;
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    int64_t StackSize = FrameInfo.getStackSize();

    if (StackSize == 0) {
      BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg());
    } else {
      BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg())
        .addImm(StackSize * ST.getWavefrontSize());
    }
  }

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);

  unsigned ScratchWaveOffsetReg;
  std::tie(ScratchWaveOffsetReg, SPReg)
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  // It's possible to have uses of only ScratchWaveOffsetReg without
  // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
  // but the inverse is not true.
  if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
    assert(ScratchRsrcReg == AMDGPU::NoRegister);
    return;
  }

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdCodeObjectV2(F)) {
    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
  bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
                         MRI.isPhysRegUsed(ScratchRsrcReg);

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  if (OffsetRegUsed) {
    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
           "scratch wave offset input is required");
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F));
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the selected register live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ST.isAmdCodeObjectV2(F) &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // This needs to be careful of the copying order to avoid overwriting one of
  // the input registers before it has been copied to its final destination.
  // Usually the offset should be copied first.
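  // For instance (hypothetical register assignment): if the scratch wave
  // offset ends up in an SGPR that is also part of the preloaded buffer
  // descriptor, writing the offset first would clobber part of the descriptor
  // before it has been copied, so the buffer copy is emitted first in that
  // case.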
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (OffsetRegUsed &&
      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg,
              MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed)
    emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
      PreloadedPrivateBufferReg, ScratchRsrcReg);
}

// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
    MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
    MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
    unsigned ScratchRsrcReg) const {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const Function &Fn = MF.getFunction();
  DebugLoc DL;

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
    if (ST.hasMergedShaders()) {
      switch (MF.getFunction().getCallingConv()) {
      case CallingConv::AMDGPU_HS:
      case CallingConv::AMDGPU_GS:
        // Low GIT address is passed in s8 rather than s0 for an LS+HS or
        // ES+GS merged shader on gfx9+.
        GitPtrLo = AMDGPU::SGPR8;
        break;
      default:
        break;
      }
    }
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MF.front().addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    PointerType *PtrTy =
      PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                       AMDGPUAS::CONSTANT_ADDRESS);
    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                       MachineMemOperand::MOInvariant |
                                       MachineMemOperand::MODereferenceable,
                                       0, 0);
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ?
        16 : 0;
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(Offset) // offset
      .addImm(0) // glc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
    return;
  }
  if (ST.isMesaGfxShader(Fn)
      || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
    assert(!ST.isAmdCodeObjectV2(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and set up the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        PointerType *PtrTy =
          PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                           AMDGPUAS::CONSTANT_ADDRESS);
        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
        auto MMO = MF.getMachineMemOperand(PtrInfo,
                                           MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                           0, 0);
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      }
    } else {
      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack
// pointer, but we would then have to make sure that we were in fact saving at
// least one callee-save register in the prologue, which is additional
// complexity that doesn't seem worth the benefit.
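//
// Note: the register found here only needs to hold a temporary within the
// prologue itself (it is killed by the realignment sequence emitted below),
// so any free SGPR from SReg_32_XM0 that is not callee-saved is acceptable.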
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
  MachineFunction *MF = MBB.getParent();

  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
  LivePhysRegs LiveRegs(TRI);
  LiveRegs.addLiveIns(MBB);

  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  MachineRegisterInfo &MRI = MF->getRegInfo();

  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }

  return AMDGPU::NoRegister;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  // XXX - Is this the right predicate?

  bool NeedFP = hasFP(MF);
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  const bool NeedsRealignment = TRI.needsStackRealignment(MF);

  if (NeedsRealignment) {
    assert(NeedFP);
    const unsigned Alignment = MFI.getMaxAlignment();

    RoundedSize += Alignment;

    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
    assert(ScratchSPReg != AMDGPU::NoRegister);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
      .addReg(StackPtrReg)
      .addImm((Alignment - 1) * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
      .addReg(ScratchSPReg, RegState::Kill)
      .addImm(-Alignment * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if (NeedFP) {
    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
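    // Roughly, in the style of the sketch above (illustrative only; the
    // actual registers depend on allocation):
    //   s_mov_b32 fp, s32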
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
      .addReg(StackPtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  if (RoundedSize != 0 && hasSP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;
    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
                             &TII->getRegisterInfo());
  }
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;
    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
                              &TII->getRegisterInfo());
  }

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  if (StackPtrReg == AMDGPU::NoRegister)
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();

  DebugLoc DL;

  // FIXME: Clarify the distinction between not setting SP and setting it. For
  // callee functions, it's really whether we need SP to be accurate or not.

  if (NumBytes != 0 && hasSP(MF)) {
    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
      NumBytes + MFI.getMaxAlignment() : NumBytes;

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize());
  }
}

static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            unsigned &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  bool AllSGPRSpilledToVGPRs = false;

  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
    AllSGPRSpilledToVGPRs = true;

    // Process all SGPR spills before frame offsets are finalized. Ideally
    // SGPRs are spilled to VGPRs, in which case we can eliminate the stack
    // usage.
    //
    // XXX - This operates under the assumption that only other SGPR spills
    // are users of the frame index. I'm not 100% sure this is correct.
    // The StackColoring pass has a comment saying a future improvement would
    // be to merge allocas with spill slots, but for now, according to
    // MachineFrameInfo, isSpillSlot can't alias any other object.
    for (MachineBasicBlock &MBB : MF) {
      MachineBasicBlock::iterator Next;
      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
        MachineInstr &MI = *I;
        Next = std::next(I);

        if (TII->isSGPRSpill(MI)) {
          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
          assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
            (void)Spilled;
            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
          } else
            AllSGPRSpilledToVGPRs = false;
        }
      }
    }

    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
  }

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // We force this to be at offset 0 so no user object ever has 0 as an
    // address, so we may use 0 as an invalid pointer value. This is because
    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
    // is required to be address space 0, we are forced to accept this for
    // now. Ideally we could have the stack in another address space with 0 as
    // a valid pointer, and -1 as the null value.
    //
    // This will also waste additional space when user stack objects require
    // more than 4-byte alignment.
    //
    // The main cost here is losing the offset for addressing modes. However
    // this also ensures we shouldn't need a register for the offset when
    // emergency scavenging.
    int ScavengeFI = MFI.CreateFixedObject(
      TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
    RS->addScavengingFrameIndex(ScavengeFI);
  }
}

void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedRegs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ?
      I->getOperand(1).getImm() : 0;

  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
  if (!TFI->hasReservedCallFrame(MF)) {
    unsigned Align = getStackAlignment();

    Amount = alignTo(Amount, Align);
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    unsigned SPReg = MFI->getStackPtrOffsetReg();

    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
      .addReg(SPReg)
      .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MachineBasicBlock::iterator I = MBB.begin();
  DebugLoc DL;

  // For each dimension:
  for (unsigned i = 0; i < 3; ++i) {
    // Get work group ID SGPR, and make it live-in again.
    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
    MBB.addLiveIn(WorkGroupIDSGPR);

    // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
    // order to spill it to scratch.
    unsigned WorkGroupIDVGPR =
      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
      .addReg(WorkGroupIDSGPR);

    // Spill work group ID.
    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
                             WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass,
                             TRI);

    // Get work item ID VGPR, and make it live-in again.
    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
    MBB.addLiveIn(WorkItemIDVGPR);

    // Spill work item ID.
    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
                             WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass,
                             TRI);
  }
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  // All stack operations are relative to the frame offset SGPR.
  // TODO: Still want to eliminate sometimes.
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // XXX - Is this only called after the frame is finalized? Should be able to
  // check the frame size.
  return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
}

bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  // All stack operations are relative to the frame offset SGPR.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasCalls() || MFI.hasVarSizedObjects() ||
         TRI->needsStackRealignment(MF);
}