1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// Implements the AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUSubtarget.h" 16 #include "AMDGPU.h" 17 #include "AMDGPUTargetMachine.h" 18 #include "AMDGPUCallLowering.h" 19 #include "AMDGPUInstructionSelector.h" 20 #include "AMDGPULegalizerInfo.h" 21 #include "AMDGPURegisterBankInfo.h" 22 #include "SIMachineFunctionInfo.h" 23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 24 #include "llvm/ADT/SmallString.h" 25 #include "llvm/CodeGen/MachineScheduler.h" 26 #include "llvm/MC/MCSubtargetInfo.h" 27 #include "llvm/IR/MDBuilder.h" 28 #include "llvm/CodeGen/TargetFrameLowering.h" 29 #include <algorithm> 30 31 using namespace llvm; 32 33 #define DEBUG_TYPE "amdgpu-subtarget" 34 35 #define GET_SUBTARGETINFO_TARGET_DESC 36 #define GET_SUBTARGETINFO_CTOR 37 #define AMDGPUSubtarget GCNSubtarget 38 #include "AMDGPUGenSubtargetInfo.inc" 39 #define GET_SUBTARGETINFO_TARGET_DESC 40 #define GET_SUBTARGETINFO_CTOR 41 #undef AMDGPUSubtarget 42 #include "R600GenSubtargetInfo.inc" 43 44 GCNSubtarget::~GCNSubtarget() = default; 45 46 R600Subtarget & 47 R600Subtarget::initializeSubtargetDependencies(const Triple &TT, 48 StringRef GPU, StringRef FS) { 49 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,"); 50 FullFS += FS; 51 ParseSubtargetFeatures(GPU, FullFS); 52 53 // FIXME: I don't think think Evergreen has any useful support for 54 // denormals, but should be checked. Should we issue a warning somewhere 55 // if someone tries to enable these? 56 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { 57 FP32Denormals = false; 58 } 59 60 HasMulU24 = getGeneration() >= EVERGREEN; 61 HasMulI24 = hasCaymanISA(); 62 63 return *this; 64 } 65 66 GCNSubtarget & 67 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, 68 StringRef GPU, StringRef FS) { 69 // Determine default and user-specified characteristics 70 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 71 // enabled, but some instructions do not respect them and they run at the 72 // double precision rate, so don't enable by default. 73 // 74 // We want to be able to turn these off, but making this a subtarget feature 75 // for SI has the unhelpful behavior that it unsets everything else if you 76 // disable it. 77 78 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); 79 80 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 81 FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,"; 82 83 // FIXME: I don't think think Evergreen has any useful support for 84 // denormals, but should be checked. Should we issue a warning somewhere 85 // if someone tries to enable these? 86 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { 87 FullFS += "+fp64-fp16-denormals,"; 88 } else { 89 FullFS += "-fp32-denormals,"; 90 } 91 92 FullFS += FS; 93 94 ParseSubtargetFeatures(GPU, FullFS); 95 96 // We don't support FP64 for EG/NI atm. 97 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); 98 99 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es 100 // on VI and newer hardware to avoid assertion failures due to missing ADDR64 101 // variants of MUBUF instructions. 102 if (!hasAddr64() && !FS.contains("flat-for-global")) { 103 FlatForGlobal = true; 104 } 105 106 // Set defaults if needed. 107 if (MaxPrivateElementSize == 0) 108 MaxPrivateElementSize = 4; 109 110 if (LDSBankCount == 0) 111 LDSBankCount = 32; 112 113 if (TT.getArch() == Triple::amdgcn) { 114 if (LocalMemorySize == 0) 115 LocalMemorySize = 32768; 116 117 // Do something sensible for unspecified target. 118 if (!HasMovrel && !HasVGPRIndexMode) 119 HasMovrel = true; 120 } 121 122 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; 123 124 return *this; 125 } 126 127 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, 128 const FeatureBitset &FeatureBits) : 129 TargetTriple(TT), 130 SubtargetFeatureBits(FeatureBits), 131 Has16BitInsts(false), 132 HasMadMixInsts(false), 133 FP32Denormals(false), 134 FPExceptions(false), 135 HasSDWA(false), 136 HasVOP3PInsts(false), 137 HasMulI24(true), 138 HasMulU24(true), 139 HasFminFmaxLegacy(true), 140 EnablePromoteAlloca(false), 141 LocalMemorySize(0), 142 WavefrontSize(0) 143 { } 144 145 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 146 const GCNTargetMachine &TM) : 147 AMDGPUGenSubtargetInfo(TT, GPU, FS), 148 AMDGPUSubtarget(TT, getFeatureBits()), 149 TargetTriple(TT), 150 Gen(SOUTHERN_ISLANDS), 151 IsaVersion(ISAVersion0_0_0), 152 LDSBankCount(0), 153 MaxPrivateElementSize(0), 154 155 FastFMAF32(false), 156 HalfRate64Ops(false), 157 158 FP64FP16Denormals(false), 159 DX10Clamp(false), 160 FlatForGlobal(false), 161 AutoWaitcntBeforeBarrier(false), 162 CodeObjectV3(false), 163 UnalignedScratchAccess(false), 164 UnalignedBufferAccess(false), 165 166 HasApertureRegs(false), 167 EnableXNACK(false), 168 TrapHandler(false), 169 DebuggerInsertNops(false), 170 DebuggerEmitPrologue(false), 171 172 EnableHugePrivateBuffer(false), 173 EnableVGPRSpilling(false), 174 EnableLoadStoreOpt(false), 175 EnableUnsafeDSOffsetFolding(false), 176 EnableSIScheduler(false), 177 EnableDS128(false), 178 DumpCode(false), 179 180 FP64(false), 181 GCN3Encoding(false), 182 CIInsts(false), 183 GFX9Insts(false), 184 SGPRInitBug(false), 185 HasSMemRealTime(false), 186 HasIntClamp(false), 187 HasFmaMixInsts(false), 188 HasMovrel(false), 189 HasVGPRIndexMode(false), 190 HasScalarStores(false), 191 HasScalarAtomics(false), 192 HasInv2PiInlineImm(false), 193 HasSDWAOmod(false), 194 HasSDWAScalar(false), 195 HasSDWASdst(false), 196 HasSDWAMac(false), 197 HasSDWAOutModsVOPC(false), 198 HasDPP(false), 199 HasDLInsts(false), 200 D16PreservesUnusedBits(false), 201 FlatAddressSpace(false), 202 FlatInstOffsets(false), 203 FlatGlobalInsts(false), 204 FlatScratchInsts(false), 205 AddNoCarryInsts(false), 206 HasUnpackedD16VMem(false), 207 208 ScalarizeGlobal(false), 209 210 FeatureDisable(false), 211 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), 212 TLInfo(TM, *this), 213 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { 214 AS = AMDGPU::getAMDGPUAS(TT); 215 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); 216 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); 217 RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); 218 InstSelector.reset(new AMDGPUInstructionSelector( 219 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); 220 } 221 222 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, 223 const Function &F) const { 224 if (NWaves == 1) 225 return getLocalMemorySize(); 226 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 227 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 228 unsigned MaxWaves = getMaxWavesPerEU(); 229 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; 230 } 231 232 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, 233 const Function &F) const { 234 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 235 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 236 unsigned MaxWaves = getMaxWavesPerEU(); 237 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; 238 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); 239 NumWaves = std::min(NumWaves, MaxWaves); 240 NumWaves = std::max(NumWaves, 1u); 241 return NumWaves; 242 } 243 244 unsigned 245 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { 246 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); 247 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); 248 } 249 250 std::pair<unsigned, unsigned> 251 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { 252 switch (CC) { 253 case CallingConv::AMDGPU_CS: 254 case CallingConv::AMDGPU_KERNEL: 255 case CallingConv::SPIR_KERNEL: 256 return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); 257 case CallingConv::AMDGPU_VS: 258 case CallingConv::AMDGPU_LS: 259 case CallingConv::AMDGPU_HS: 260 case CallingConv::AMDGPU_ES: 261 case CallingConv::AMDGPU_GS: 262 case CallingConv::AMDGPU_PS: 263 return std::make_pair(1, getWavefrontSize()); 264 default: 265 return std::make_pair(1, 16 * getWavefrontSize()); 266 } 267 } 268 269 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 270 const Function &F) const { 271 // FIXME: 1024 if function. 272 // Default minimum/maximum flat work group sizes. 273 std::pair<unsigned, unsigned> Default = 274 getDefaultFlatWorkGroupSize(F.getCallingConv()); 275 276 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 277 // starts using "amdgpu-flat-work-group-size" attribute. 278 Default.second = AMDGPU::getIntegerAttribute( 279 F, "amdgpu-max-work-group-size", Default.second); 280 Default.first = std::min(Default.first, Default.second); 281 282 // Requested minimum/maximum flat work group sizes. 283 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 284 F, "amdgpu-flat-work-group-size", Default); 285 286 // Make sure requested minimum is less than requested maximum. 287 if (Requested.first > Requested.second) 288 return Default; 289 290 // Make sure requested values do not violate subtarget's specifications. 291 if (Requested.first < getMinFlatWorkGroupSize()) 292 return Default; 293 if (Requested.second > getMaxFlatWorkGroupSize()) 294 return Default; 295 296 return Requested; 297 } 298 299 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 300 const Function &F) const { 301 // Default minimum/maximum number of waves per execution unit. 302 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); 303 304 // Default/requested minimum/maximum flat work group sizes. 305 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 306 307 // If minimum/maximum flat work group sizes were explicitly requested using 308 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 309 // number of waves per execution unit to values implied by requested 310 // minimum/maximum flat work group sizes. 311 unsigned MinImpliedByFlatWorkGroupSize = 312 getMaxWavesPerEU(FlatWorkGroupSizes.second); 313 bool RequestedFlatWorkGroupSize = false; 314 315 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 316 // starts using "amdgpu-flat-work-group-size" attribute. 317 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 318 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 319 Default.first = MinImpliedByFlatWorkGroupSize; 320 RequestedFlatWorkGroupSize = true; 321 } 322 323 // Requested minimum/maximum number of waves per execution unit. 324 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 325 F, "amdgpu-waves-per-eu", Default, true); 326 327 // Make sure requested minimum is less than requested maximum. 328 if (Requested.second && Requested.first > Requested.second) 329 return Default; 330 331 // Make sure requested values do not violate subtarget's specifications. 332 if (Requested.first < getMinWavesPerEU() || 333 Requested.first > getMaxWavesPerEU()) 334 return Default; 335 if (Requested.second > getMaxWavesPerEU()) 336 return Default; 337 338 // Make sure requested values are compatible with values implied by requested 339 // minimum/maximum flat work group sizes. 340 if (RequestedFlatWorkGroupSize && 341 Requested.first < MinImpliedByFlatWorkGroupSize) 342 return Default; 343 344 return Requested; 345 } 346 347 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { 348 Function *Kernel = I->getParent()->getParent(); 349 unsigned MinSize = 0; 350 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; 351 bool IdQuery = false; 352 353 // If reqd_work_group_size is present it narrows value down. 354 if (auto *CI = dyn_cast<CallInst>(I)) { 355 const Function *F = CI->getCalledFunction(); 356 if (F) { 357 unsigned Dim = UINT_MAX; 358 switch (F->getIntrinsicID()) { 359 case Intrinsic::amdgcn_workitem_id_x: 360 case Intrinsic::r600_read_tidig_x: 361 IdQuery = true; 362 LLVM_FALLTHROUGH; 363 case Intrinsic::r600_read_local_size_x: 364 Dim = 0; 365 break; 366 case Intrinsic::amdgcn_workitem_id_y: 367 case Intrinsic::r600_read_tidig_y: 368 IdQuery = true; 369 LLVM_FALLTHROUGH; 370 case Intrinsic::r600_read_local_size_y: 371 Dim = 1; 372 break; 373 case Intrinsic::amdgcn_workitem_id_z: 374 case Intrinsic::r600_read_tidig_z: 375 IdQuery = true; 376 LLVM_FALLTHROUGH; 377 case Intrinsic::r600_read_local_size_z: 378 Dim = 2; 379 break; 380 default: 381 break; 382 } 383 if (Dim <= 3) { 384 if (auto Node = Kernel->getMetadata("reqd_work_group_size")) 385 if (Node->getNumOperands() == 3) 386 MinSize = MaxSize = mdconst::extract<ConstantInt>( 387 Node->getOperand(Dim))->getZExtValue(); 388 } 389 } 390 } 391 392 if (!MaxSize) 393 return false; 394 395 // Range metadata is [Lo, Hi). For ID query we need to pass max size 396 // as Hi. For size query we need to pass Hi + 1. 397 if (IdQuery) 398 MinSize = 0; 399 else 400 ++MaxSize; 401 402 MDBuilder MDB(I->getContext()); 403 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize), 404 APInt(32, MaxSize)); 405 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); 406 return true; 407 } 408 409 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, 410 unsigned &MaxAlign) const { 411 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 412 F.getCallingConv() == CallingConv::SPIR_KERNEL); 413 414 const DataLayout &DL = F.getParent()->getDataLayout(); 415 uint64_t ExplicitArgBytes = 0; 416 MaxAlign = 1; 417 418 for (const Argument &Arg : F.args()) { 419 Type *ArgTy = Arg.getType(); 420 421 unsigned Align = DL.getABITypeAlignment(ArgTy); 422 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); 423 ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; 424 MaxAlign = std::max(MaxAlign, Align); 425 } 426 427 return ExplicitArgBytes; 428 } 429 430 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, 431 unsigned &MaxAlign) const { 432 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); 433 434 unsigned ExplicitOffset = getExplicitKernelArgOffset(F); 435 436 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; 437 unsigned ImplicitBytes = getImplicitArgNumBytes(F); 438 if (ImplicitBytes != 0) { 439 unsigned Alignment = getAlignmentForImplicitArgPtr(); 440 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 441 } 442 443 // Being able to dereference past the end is useful for emitting scalar loads. 444 return alignTo(TotalSize, 4); 445 } 446 447 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 448 const TargetMachine &TM) : 449 R600GenSubtargetInfo(TT, GPU, FS), 450 AMDGPUSubtarget(TT, getFeatureBits()), 451 InstrInfo(*this), 452 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 453 FMA(false), 454 CaymanISA(false), 455 CFALUBug(false), 456 DX10Clamp(false), 457 HasVertexCache(false), 458 R600ALUInst(false), 459 FP64(false), 460 TexVTXClauseSize(0), 461 Gen(R600), 462 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), 463 InstrItins(getInstrItineraryForCPU(GPU)), 464 AS (AMDGPU::getAMDGPUAS(TT)) { } 465 466 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 467 unsigned NumRegionInstrs) const { 468 // Track register pressure so the scheduler can try to decrease 469 // pressure once register usage is above the threshold defined by 470 // SIRegisterInfo::getRegPressureSetLimit() 471 Policy.ShouldTrackPressure = true; 472 473 // Enabling both top down and bottom up scheduling seems to give us less 474 // register spills than just using one of these approaches on its own. 475 Policy.OnlyTopDown = false; 476 Policy.OnlyBottomUp = false; 477 478 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 479 if (!enableSIScheduler()) 480 Policy.ShouldTrackLaneMasks = true; 481 } 482 483 bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const { 484 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 485 } 486 487 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 488 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 489 if (SGPRs <= 80) 490 return 10; 491 if (SGPRs <= 88) 492 return 9; 493 if (SGPRs <= 100) 494 return 8; 495 return 7; 496 } 497 if (SGPRs <= 48) 498 return 10; 499 if (SGPRs <= 56) 500 return 9; 501 if (SGPRs <= 64) 502 return 8; 503 if (SGPRs <= 72) 504 return 7; 505 if (SGPRs <= 80) 506 return 6; 507 return 5; 508 } 509 510 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 511 if (VGPRs <= 24) 512 return 10; 513 if (VGPRs <= 28) 514 return 9; 515 if (VGPRs <= 32) 516 return 8; 517 if (VGPRs <= 36) 518 return 7; 519 if (VGPRs <= 40) 520 return 6; 521 if (VGPRs <= 48) 522 return 5; 523 if (VGPRs <= 64) 524 return 4; 525 if (VGPRs <= 84) 526 return 3; 527 if (VGPRs <= 128) 528 return 2; 529 return 1; 530 } 531 532 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { 533 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 534 if (MFI.hasFlatScratchInit()) { 535 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 536 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 537 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) 538 return 4; // FLAT_SCRATCH, VCC (in that order). 539 } 540 541 if (isXNACKEnabled()) 542 return 4; // XNACK, VCC (in that order). 543 return 2; // VCC. 544 } 545 546 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { 547 const Function &F = MF.getFunction(); 548 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 549 550 // Compute maximum number of SGPRs function can use using default/requested 551 // minimum number of waves per execution unit. 552 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 553 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); 554 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); 555 556 // Check if maximum number of SGPRs was explicitly requested using 557 // "amdgpu-num-sgpr" attribute. 558 if (F.hasFnAttribute("amdgpu-num-sgpr")) { 559 unsigned Requested = AMDGPU::getIntegerAttribute( 560 F, "amdgpu-num-sgpr", MaxNumSGPRs); 561 562 // Make sure requested value does not violate subtarget's specifications. 563 if (Requested && (Requested <= getReservedNumSGPRs(MF))) 564 Requested = 0; 565 566 // If more SGPRs are required to support the input user/system SGPRs, 567 // increase to accommodate them. 568 // 569 // FIXME: This really ends up using the requested number of SGPRs + number 570 // of reserved special registers in total. Theoretically you could re-use 571 // the last input registers for these special registers, but this would 572 // require a lot of complexity to deal with the weird aliasing. 573 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); 574 if (Requested && Requested < InputNumSGPRs) 575 Requested = InputNumSGPRs; 576 577 // Make sure requested value is compatible with values implied by 578 // default/requested minimum/maximum number of waves per execution unit. 579 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) 580 Requested = 0; 581 if (WavesPerEU.second && 582 Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) 583 Requested = 0; 584 585 if (Requested) 586 MaxNumSGPRs = Requested; 587 } 588 589 if (hasSGPRInitBug()) 590 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; 591 592 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), 593 MaxAddressableNumSGPRs); 594 } 595 596 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { 597 const Function &F = MF.getFunction(); 598 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 599 600 // Compute maximum number of VGPRs function can use using default/requested 601 // minimum number of waves per execution unit. 602 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 603 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); 604 605 // Check if maximum number of VGPRs was explicitly requested using 606 // "amdgpu-num-vgpr" attribute. 607 if (F.hasFnAttribute("amdgpu-num-vgpr")) { 608 unsigned Requested = AMDGPU::getIntegerAttribute( 609 F, "amdgpu-num-vgpr", MaxNumVGPRs); 610 611 // Make sure requested value is compatible with values implied by 612 // default/requested minimum/maximum number of waves per execution unit. 613 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) 614 Requested = 0; 615 if (WavesPerEU.second && 616 Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) 617 Requested = 0; 618 619 if (Requested) 620 MaxNumVGPRs = Requested; 621 } 622 623 return MaxNumVGPRs; 624 } 625 626 namespace { 627 struct MemOpClusterMutation : ScheduleDAGMutation { 628 const SIInstrInfo *TII; 629 630 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} 631 632 void apply(ScheduleDAGInstrs *DAGInstrs) override { 633 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); 634 635 SUnit *SUa = nullptr; 636 // Search for two consequent memory operations and link them 637 // to prevent scheduler from moving them apart. 638 // In DAG pre-process SUnits are in the original order of 639 // the instructions before scheduling. 640 for (SUnit &SU : DAG->SUnits) { 641 MachineInstr &MI2 = *SU.getInstr(); 642 if (!MI2.mayLoad() && !MI2.mayStore()) { 643 SUa = nullptr; 644 continue; 645 } 646 if (!SUa) { 647 SUa = &SU; 648 continue; 649 } 650 651 MachineInstr &MI1 = *SUa->getInstr(); 652 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || 653 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || 654 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || 655 (TII->isDS(MI1) && TII->isDS(MI2))) { 656 SU.addPredBarrier(SUa); 657 658 for (const SDep &SI : SU.Preds) { 659 if (SI.getSUnit() != SUa) 660 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); 661 } 662 663 if (&SU != &DAG->ExitSU) { 664 for (const SDep &SI : SUa->Succs) { 665 if (SI.getSUnit() != &SU) 666 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); 667 } 668 } 669 } 670 671 SUa = &SU; 672 } 673 } 674 }; 675 } // namespace 676 677 void GCNSubtarget::getPostRAMutations( 678 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { 679 Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); 680 } 681 682 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { 683 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) 684 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>()); 685 else 686 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>()); 687 } 688 689 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) { 690 if (TM.getTargetTriple().getArch() == Triple::amdgcn) 691 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F)); 692 else 693 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F)); 694 } 695