1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief The AMDGPU target machine contains all of the hardware specific 12 /// information needed to emit code for R600 and SI GPUs. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPUTargetMachine.h" 17 #include "AMDGPU.h" 18 #include "AMDGPUCallLowering.h" 19 #include "AMDGPUTargetObjectFile.h" 20 #include "AMDGPUTargetTransformInfo.h" 21 #include "R600ISelLowering.h" 22 #include "R600InstrInfo.h" 23 #include "R600MachineScheduler.h" 24 #include "SIISelLowering.h" 25 #include "SIInstrInfo.h" 26 27 #include "llvm/Analysis/Passes.h" 28 #include "llvm/CodeGen/GlobalISel/IRTranslator.h" 29 #include "llvm/CodeGen/MachineFunctionAnalysis.h" 30 #include "llvm/CodeGen/MachineModuleInfo.h" 31 #include "llvm/CodeGen/Passes.h" 32 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 33 #include "llvm/CodeGen/TargetPassConfig.h" 34 #include "llvm/IR/Verifier.h" 35 #include "llvm/MC/MCAsmInfo.h" 36 #include "llvm/IR/LegacyPassManager.h" 37 #include "llvm/Support/TargetRegistry.h" 38 #include "llvm/Support/raw_os_ostream.h" 39 #include "llvm/Transforms/IPO.h" 40 #include "llvm/Transforms/Scalar.h" 41 #include "llvm/Transforms/Scalar/GVN.h" 42 #include "llvm/Transforms/Vectorize.h" 43 44 using namespace llvm; 45 46 static cl::opt<bool> EnableR600StructurizeCFG( 47 "r600-ir-structurize", 48 cl::desc("Use StructurizeCFG IR pass"), 49 cl::init(true)); 50 51 static cl::opt<bool> EnableSROA( 52 "amdgpu-sroa", 53 cl::desc("Run SROA after promote alloca pass"), 54 cl::ReallyHidden, 55 cl::init(true)); 56 57 static cl::opt<bool> EnableR600IfConvert( 58 "r600-if-convert", 59 cl::desc("Use if conversion pass"), 60 cl::ReallyHidden, 61 cl::init(true)); 62 63 // Option to disable vectorizer for tests. 64 static cl::opt<bool> EnableLoadStoreVectorizer( 65 "amdgpu-load-store-vectorizer", 66 cl::desc("Enable load store vectorizer"), 67 cl::init(false), 68 cl::Hidden); 69 70 extern "C" void LLVMInitializeAMDGPUTarget() { 71 // Register the target 72 RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); 73 RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); 74 75 PassRegistry *PR = PassRegistry::getPassRegistry(); 76 initializeSILowerI1CopiesPass(*PR); 77 initializeSIFixSGPRCopiesPass(*PR); 78 initializeSIFoldOperandsPass(*PR); 79 initializeSIShrinkInstructionsPass(*PR); 80 initializeSIFixControlFlowLiveIntervalsPass(*PR); 81 initializeSILoadStoreOptimizerPass(*PR); 82 initializeAMDGPUAnnotateKernelFeaturesPass(*PR); 83 initializeAMDGPUAnnotateUniformValuesPass(*PR); 84 initializeAMDGPUPromoteAllocaPass(*PR); 85 initializeAMDGPUCodeGenPreparePass(*PR); 86 initializeSIAnnotateControlFlowPass(*PR); 87 initializeSIDebuggerInsertNopsPass(*PR); 88 initializeSIInsertWaitsPass(*PR); 89 initializeSIWholeQuadModePass(*PR); 90 initializeSILowerControlFlowPass(*PR); 91 initializeSIDebuggerInsertNopsPass(*PR); 92 } 93 94 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { 95 return make_unique<AMDGPUTargetObjectFile>(); 96 } 97 98 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { 99 return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); 100 } 101 102 static MachineSchedRegistry 103 R600SchedRegistry("r600", "Run R600's custom scheduler", 104 createR600MachineScheduler); 105 106 static MachineSchedRegistry 107 SISchedRegistry("si", "Run SI's custom scheduler", 108 createSIMachineScheduler); 109 110 static StringRef computeDataLayout(const Triple &TT) { 111 if (TT.getArch() == Triple::r600) { 112 // 32-bit pointers. 113 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 114 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; 115 } 116 117 // 32-bit private, local, and region pointers. 64-bit global, constant and 118 // flat. 119 return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" 120 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 121 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; 122 } 123 124 LLVM_READNONE 125 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 126 if (!GPU.empty()) 127 return GPU; 128 129 // HSA only supports CI+, so change the default GPU to a CI for HSA. 130 if (TT.getArch() == Triple::amdgcn) 131 return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti"; 132 133 return "r600"; 134 } 135 136 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 137 // The AMDGPU toolchain only supports generating shared objects, so we 138 // must always use PIC. 139 return Reloc::PIC_; 140 } 141 142 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 143 StringRef CPU, StringRef FS, 144 TargetOptions Options, 145 Optional<Reloc::Model> RM, 146 CodeModel::Model CM, 147 CodeGenOpt::Level OptLevel) 148 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 149 FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), 150 TLOF(createTLOF(getTargetTriple())), 151 IntrinsicInfo() { 152 setRequiresStructuredCFG(true); 153 initAsmInfo(); 154 } 155 156 AMDGPUTargetMachine::~AMDGPUTargetMachine() { } 157 158 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { 159 Attribute GPUAttr = F.getFnAttribute("target-cpu"); 160 return GPUAttr.hasAttribute(Attribute::None) ? 161 getTargetCPU() : GPUAttr.getValueAsString(); 162 } 163 164 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { 165 Attribute FSAttr = F.getFnAttribute("target-features"); 166 167 return FSAttr.hasAttribute(Attribute::None) ? 168 getTargetFeatureString() : 169 FSAttr.getValueAsString(); 170 } 171 172 //===----------------------------------------------------------------------===// 173 // R600 Target Machine (R600 -> Cayman) 174 //===----------------------------------------------------------------------===// 175 176 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, 177 StringRef CPU, StringRef FS, 178 TargetOptions Options, 179 Optional<Reloc::Model> RM, 180 CodeModel::Model CM, CodeGenOpt::Level OL) 181 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 182 183 const R600Subtarget *R600TargetMachine::getSubtargetImpl( 184 const Function &F) const { 185 StringRef GPU = getGPUName(F); 186 StringRef FS = getFeatureString(F); 187 188 SmallString<128> SubtargetKey(GPU); 189 SubtargetKey.append(FS); 190 191 auto &I = SubtargetMap[SubtargetKey]; 192 if (!I) { 193 // This needs to be done before we create a new subtarget since any 194 // creation will depend on the TM and the code generation flags on the 195 // function that reside in TargetOptions. 196 resetTargetOptions(F); 197 I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); 198 } 199 200 return I.get(); 201 } 202 203 //===----------------------------------------------------------------------===// 204 // GCN Target Machine (SI+) 205 //===----------------------------------------------------------------------===// 206 207 #ifdef LLVM_BUILD_GLOBAL_ISEL 208 namespace { 209 struct SIGISelActualAccessor : public GISelAccessor { 210 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 211 const AMDGPUCallLowering *getCallLowering() const override { 212 return CallLoweringInfo.get(); 213 } 214 }; 215 } // End anonymous namespace. 216 #endif 217 218 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, 219 StringRef CPU, StringRef FS, 220 TargetOptions Options, 221 Optional<Reloc::Model> RM, 222 CodeModel::Model CM, CodeGenOpt::Level OL) 223 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 224 225 const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { 226 StringRef GPU = getGPUName(F); 227 StringRef FS = getFeatureString(F); 228 229 SmallString<128> SubtargetKey(GPU); 230 SubtargetKey.append(FS); 231 232 auto &I = SubtargetMap[SubtargetKey]; 233 if (!I) { 234 // This needs to be done before we create a new subtarget since any 235 // creation will depend on the TM and the code generation flags on the 236 // function that reside in TargetOptions. 237 resetTargetOptions(F); 238 I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); 239 240 #ifndef LLVM_BUILD_GLOBAL_ISEL 241 GISelAccessor *GISel = new GISelAccessor(); 242 #else 243 SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); 244 GISel->CallLoweringInfo.reset( 245 new AMDGPUCallLowering(*I->getTargetLowering())); 246 #endif 247 248 I->setGISelAccessor(*GISel); 249 } 250 251 return I.get(); 252 } 253 254 //===----------------------------------------------------------------------===// 255 // AMDGPU Pass Setup 256 //===----------------------------------------------------------------------===// 257 258 namespace { 259 260 class AMDGPUPassConfig : public TargetPassConfig { 261 public: 262 AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) 263 : TargetPassConfig(TM, PM) { 264 265 // Exceptions and StackMaps are not supported, so these passes will never do 266 // anything. 267 disablePass(&StackMapLivenessID); 268 disablePass(&FuncletLayoutID); 269 } 270 271 AMDGPUTargetMachine &getAMDGPUTargetMachine() const { 272 return getTM<AMDGPUTargetMachine>(); 273 } 274 275 void addEarlyCSEOrGVNPass(); 276 void addStraightLineScalarOptimizationPasses(); 277 void addIRPasses() override; 278 void addCodeGenPrepare() override; 279 bool addPreISel() override; 280 bool addInstSelector() override; 281 bool addGCPasses() override; 282 }; 283 284 class R600PassConfig final : public AMDGPUPassConfig { 285 public: 286 R600PassConfig(TargetMachine *TM, PassManagerBase &PM) 287 : AMDGPUPassConfig(TM, PM) { } 288 289 ScheduleDAGInstrs *createMachineScheduler( 290 MachineSchedContext *C) const override { 291 return createR600MachineScheduler(C); 292 } 293 294 bool addPreISel() override; 295 void addPreRegAlloc() override; 296 void addPreSched2() override; 297 void addPreEmitPass() override; 298 }; 299 300 class GCNPassConfig final : public AMDGPUPassConfig { 301 public: 302 GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) 303 : AMDGPUPassConfig(TM, PM) { } 304 305 GCNTargetMachine &getGCNTargetMachine() const { 306 return getTM<GCNTargetMachine>(); 307 } 308 309 ScheduleDAGInstrs * 310 createMachineScheduler(MachineSchedContext *C) const override; 311 312 bool addPreISel() override; 313 void addMachineSSAOptimization() override; 314 bool addInstSelector() override; 315 #ifdef LLVM_BUILD_GLOBAL_ISEL 316 bool addIRTranslator() override; 317 bool addRegBankSelect() override; 318 #endif 319 void addFastRegAlloc(FunctionPass *RegAllocPass) override; 320 void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; 321 void addPreRegAlloc() override; 322 void addPreSched2() override; 323 void addPreEmitPass() override; 324 }; 325 326 } // End of anonymous namespace 327 328 TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { 329 return TargetIRAnalysis([this](const Function &F) { 330 return TargetTransformInfo(AMDGPUTTIImpl(this, F)); 331 }); 332 } 333 334 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { 335 if (getOptLevel() == CodeGenOpt::Aggressive) 336 addPass(createGVNPass()); 337 else 338 addPass(createEarlyCSEPass()); 339 } 340 341 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { 342 addPass(createSeparateConstOffsetFromGEPPass()); 343 addPass(createSpeculativeExecutionPass()); 344 // ReassociateGEPs exposes more opportunites for SLSR. See 345 // the example in reassociate-geps-and-slsr.ll. 346 addPass(createStraightLineStrengthReducePass()); 347 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or 348 // EarlyCSE can reuse. 349 addEarlyCSEOrGVNPass(); 350 // Run NaryReassociate after EarlyCSE/GVN to be more effective. 351 addPass(createNaryReassociatePass()); 352 // NaryReassociate on GEPs creates redundant common expressions, so run 353 // EarlyCSE after it. 354 addPass(createEarlyCSEPass()); 355 } 356 357 void AMDGPUPassConfig::addIRPasses() { 358 // There is no reason to run these. 359 disablePass(&StackMapLivenessID); 360 disablePass(&FuncletLayoutID); 361 disablePass(&PatchableFunctionID); 362 363 // Function calls are not supported, so make sure we inline everything. 364 addPass(createAMDGPUAlwaysInlinePass()); 365 addPass(createAlwaysInlinerPass()); 366 // We need to add the barrier noop pass, otherwise adding the function 367 // inlining pass will cause all of the PassConfigs passes to be run 368 // one function at a time, which means if we have a nodule with two 369 // functions, then we will generate code for the first function 370 // without ever running any passes on the second. 371 addPass(createBarrierNoopPass()); 372 373 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 374 addPass(createAMDGPUOpenCLImageTypeLoweringPass()); 375 376 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); 377 if (TM.getOptLevel() > CodeGenOpt::None) { 378 addPass(createAMDGPUPromoteAlloca(&TM)); 379 380 if (EnableSROA) 381 addPass(createSROAPass()); 382 } 383 384 addStraightLineScalarOptimizationPasses(); 385 386 TargetPassConfig::addIRPasses(); 387 388 // EarlyCSE is not always strong enough to clean up what LSR produces. For 389 // example, GVN can combine 390 // 391 // %0 = add %a, %b 392 // %1 = add %b, %a 393 // 394 // and 395 // 396 // %0 = shl nsw %a, 2 397 // %1 = shl %a, 2 398 // 399 // but EarlyCSE can do neither of them. 400 if (getOptLevel() != CodeGenOpt::None) 401 addEarlyCSEOrGVNPass(); 402 } 403 404 void AMDGPUPassConfig::addCodeGenPrepare() { 405 TargetPassConfig::addCodeGenPrepare(); 406 407 if (EnableLoadStoreVectorizer) 408 addPass(createLoadStoreVectorizerPass()); 409 } 410 411 bool AMDGPUPassConfig::addPreISel() { 412 addPass(createFlattenCFGPass()); 413 return false; 414 } 415 416 bool AMDGPUPassConfig::addInstSelector() { 417 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); 418 return false; 419 } 420 421 bool AMDGPUPassConfig::addGCPasses() { 422 // Do nothing. GC is not supported. 423 return false; 424 } 425 426 //===----------------------------------------------------------------------===// 427 // R600 Pass Setup 428 //===----------------------------------------------------------------------===// 429 430 bool R600PassConfig::addPreISel() { 431 AMDGPUPassConfig::addPreISel(); 432 433 if (EnableR600StructurizeCFG) 434 addPass(createStructurizeCFGPass()); 435 return false; 436 } 437 438 void R600PassConfig::addPreRegAlloc() { 439 addPass(createR600VectorRegMerger(*TM)); 440 } 441 442 void R600PassConfig::addPreSched2() { 443 addPass(createR600EmitClauseMarkers(), false); 444 if (EnableR600IfConvert) 445 addPass(&IfConverterID, false); 446 addPass(createR600ClauseMergePass(*TM), false); 447 } 448 449 void R600PassConfig::addPreEmitPass() { 450 addPass(createAMDGPUCFGStructurizerPass(), false); 451 addPass(createR600ExpandSpecialInstrsPass(*TM), false); 452 addPass(&FinalizeMachineBundlesID, false); 453 addPass(createR600Packetizer(*TM), false); 454 addPass(createR600ControlFlowFinalizer(*TM), false); 455 } 456 457 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { 458 return new R600PassConfig(this, PM); 459 } 460 461 //===----------------------------------------------------------------------===// 462 // GCN Pass Setup 463 //===----------------------------------------------------------------------===// 464 465 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( 466 MachineSchedContext *C) const { 467 const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); 468 if (ST.enableSIScheduler()) 469 return createSIMachineScheduler(C); 470 return nullptr; 471 } 472 473 bool GCNPassConfig::addPreISel() { 474 AMDGPUPassConfig::addPreISel(); 475 476 // FIXME: We need to run a pass to propagate the attributes when calls are 477 // supported. 478 addPass(&AMDGPUAnnotateKernelFeaturesID); 479 addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions 480 addPass(createSinkingPass()); 481 addPass(createSITypeRewriter()); 482 addPass(createAMDGPUAnnotateUniformValues()); 483 addPass(createSIAnnotateControlFlowPass()); 484 485 return false; 486 } 487 488 void GCNPassConfig::addMachineSSAOptimization() { 489 TargetPassConfig::addMachineSSAOptimization(); 490 491 // We want to fold operands after PeepholeOptimizer has run (or as part of 492 // it), because it will eliminate extra copies making it easier to fold the 493 // real source operand. We want to eliminate dead instructions after, so that 494 // we see fewer uses of the copies. We then need to clean up the dead 495 // instructions leftover after the operands are folded as well. 496 // 497 // XXX - Can we get away without running DeadMachineInstructionElim again? 498 addPass(&SIFoldOperandsID); 499 addPass(&DeadMachineInstructionElimID); 500 } 501 502 bool GCNPassConfig::addInstSelector() { 503 AMDGPUPassConfig::addInstSelector(); 504 addPass(createSILowerI1CopiesPass()); 505 addPass(&SIFixSGPRCopiesID); 506 return false; 507 } 508 509 #ifdef LLVM_BUILD_GLOBAL_ISEL 510 bool GCNPassConfig::addIRTranslator() { 511 addPass(new IRTranslator()); 512 return false; 513 } 514 515 bool GCNPassConfig::addRegBankSelect() { 516 return false; 517 } 518 #endif 519 520 void GCNPassConfig::addPreRegAlloc() { 521 // This needs to be run directly before register allocation because 522 // earlier passes might recompute live intervals. 523 // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass 524 if (getOptLevel() > CodeGenOpt::None) { 525 insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); 526 } 527 528 if (getOptLevel() > CodeGenOpt::None) { 529 // Don't do this with no optimizations since it throws away debug info by 530 // merging nonadjacent loads. 531 532 // This should be run after scheduling, but before register allocation. It 533 // also need extra copies to the address operand to be eliminated. 534 535 // FIXME: Move pre-RA and remove extra reg coalescer run. 536 insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); 537 insertPass(&MachineSchedulerID, &RegisterCoalescerID); 538 } 539 540 addPass(createSIShrinkInstructionsPass()); 541 addPass(createSIWholeQuadModePass()); 542 } 543 544 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { 545 TargetPassConfig::addFastRegAlloc(RegAllocPass); 546 } 547 548 void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { 549 TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); 550 } 551 552 void GCNPassConfig::addPreSched2() { 553 } 554 555 void GCNPassConfig::addPreEmitPass() { 556 // The hazard recognizer that runs as part of the post-ra scheduler does not 557 // guarantee to be able handle all hazards correctly. This is because if there 558 // are multiple scheduling regions in a basic block, the regions are scheduled 559 // bottom up, so when we begin to schedule a region we don't know what 560 // instructions were emitted directly before it. 561 // 562 // Here we add a stand-alone hazard recognizer pass which can handle all 563 // cases. 564 addPass(&PostRAHazardRecognizerID); 565 566 addPass(createSIInsertWaitsPass()); 567 addPass(createSIShrinkInstructionsPass()); 568 addPass(createSILowerControlFlowPass()); 569 addPass(createSIDebuggerInsertNopsPass()); 570 } 571 572 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { 573 return new GCNPassConfig(this, PM); 574 } 575