1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" 17 18 #include <map> 19 #include <memory> 20 #include <string> 21 #include <utility> 22 23 #include "tensorflow/compiler/xla/ptr_util.h" 24 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h" 25 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h" 26 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" 27 #include "tensorflow/compiler/xla/status_macros.h" 28 #include "tensorflow/compiler/xla/util.h" 29 30 #include "llvm/ADT/STLExtras.h" 31 #include "llvm/ADT/StringMap.h" 32 #include "llvm/ADT/StringSet.h" 33 #include "llvm/Analysis/TargetLibraryInfo.h" 34 #include "llvm/Analysis/TargetTransformInfo.h" 35 #include "llvm/Bitcode/BitcodeReader.h" 36 #include "llvm/Bitcode/BitcodeWriter.h" 37 #include "llvm/CodeGen/CommandFlags.def" 38 #include "llvm/IR/LLVMContext.h" 39 #include "llvm/IR/LegacyPassManager.h" 40 #include "llvm/IR/Module.h" 41 #include "llvm/IR/Verifier.h" 42 #include "llvm/Linker/Linker.h" 43 #include "llvm/PassRegistry.h" 44 #include "llvm/Support/CommandLine.h" 45 #include "llvm/Support/FileSystem.h" 46 #include "llvm/Support/FormattedStream.h" 47 #include "llvm/Support/TargetRegistry.h" 48 #include "llvm/Support/TargetSelect.h" 49 #include "llvm/Support/ToolOutputFile.h" 50 #include "llvm/Target/TargetMachine.h" 51 #include "llvm/Transforms/IPO.h" 52 #include "llvm/Transforms/IPO/AlwaysInliner.h" 53 #include "llvm/Transforms/IPO/Internalize.h" 54 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 55 #include "llvm/Transforms/Scalar.h" 56 #include "tensorflow/compiler/xla/types.h" 57 #include "tensorflow/core/lib/core/stringpiece.h" 58 #include "tensorflow/core/lib/io/path.h" 59 #include "tensorflow/core/lib/strings/str_util.h" 60 #include "tensorflow/core/lib/strings/stringprintf.h" 61 #include "tensorflow/core/platform/env.h" 62 #include "tensorflow/core/platform/logging.h" 63 #include "tensorflow/core/platform/tracing.h" 64 65 namespace xla { 66 namespace gpu { 67 namespace { 68 69 // Default inline threshold value to use in llvm. 70 const int kDefaultInlineThreshold = 1100; 71 72 // Gets the libdevice filename for a particular compute capability. When 73 // presented with a GPU we don't recognize, we just return the libdevice from 74 // compute_20. 75 static string GetLibdeviceFilename(const string& libdevice_dir_path, 76 std::pair<int, int> compute_capability) { 77 // Since CUDA 9.0, all GPU versions are included in a single file 78 const char* unified_libdevice_filename = "libdevice.10.bc"; 79 std::vector<string> unified_libdevice_files; 80 const tensorflow::Status status = 81 tensorflow::Env::Default()->GetMatchingPaths( 82 tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename), 83 &unified_libdevice_files); 84 if (status.ok() && unified_libdevice_files.size() == 1) { 85 return unified_libdevice_filename; 86 } 87 // There are only four libdevice files: compute_{20,30,35,50}. Each GPU 88 // version gets mapped to one of these. Note in particular that sm_60 and 89 // sm_61 map to libdevice.compute_30. 90 static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20}, 91 {{2, 1}, 20}, 92 {{3, 0}, 30}, 93 {{3, 2}, 30}, 94 {{3, 5}, 35}, 95 {{3, 7}, 35}, 96 {{5, 0}, 50}, 97 {{5, 2}, 50}, 98 {{5, 3}, 50}, 99 {{6, 0}, 30}, 100 {{6, 1}, 30}, 101 {{6, 2}, 30}}); 102 int libdevice_version = 20; 103 auto it = m->find(compute_capability); 104 if (it != m->end()) { 105 libdevice_version = it->second; 106 } else { 107 LOG(WARNING) << "Unknown compute capability (" << compute_capability.first 108 << ", " << compute_capability.second << ") ." 109 << "Defaulting to libdevice for compute_" << libdevice_version; 110 } 111 return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version, 112 ".10.bc"); 113 } 114 115 // Gets the GPU name as it's known to LLVM for a given compute capability. If 116 // we see an unrecognized compute capability, we return "sm_30". 117 static string GetSmName(std::pair<int, int> compute_capability) { 118 static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20}, 119 {{2, 1}, 21}, 120 {{3, 0}, 30}, 121 {{3, 2}, 32}, 122 {{3, 5}, 35}, 123 {{3, 7}, 37}, 124 {{5, 0}, 50}, 125 {{5, 2}, 52}, 126 {{5, 3}, 53}, 127 {{6, 0}, 60}, 128 {{6, 1}, 61}, 129 {{6, 2}, 62}, 130 // TODO: Change this to 70 once LLVM NVPTX supports it 131 {{7, 0}, 60}}); 132 int sm_version = 30; 133 auto it = m->find(compute_capability); 134 if (it != m->end()) { 135 sm_version = it->second; 136 } else { 137 LOG(WARNING) << "Unknown compute capability (" << compute_capability.first 138 << ", " << compute_capability.second << ") ." 139 << "Defaulting to telling LLVM that we're compiling for sm_" 140 << sm_version; 141 } 142 return tensorflow::strings::StrCat("sm_", sm_version); 143 } 144 145 // Convenience function for producing a name of a temporary compilation product 146 // from the input filename. 147 string MakeNameForTempProduct(const std::string& input_filename, 148 tensorflow::StringPiece extension) { 149 return ReplaceFilenameExtension( 150 tensorflow::io::Basename(llvm_ir::AsString(input_filename)), extension); 151 } 152 153 // Initializes LLVM passes. Uses the PassRegistry mechanism. 154 void InitializePasses(llvm::PassRegistry* pass_registry) { 155 llvm::initializeCore(*pass_registry); 156 llvm::initializeCodeGen(*pass_registry); 157 llvm::initializeScalarOpts(*pass_registry); 158 llvm::initializeObjCARCOpts(*pass_registry); 159 llvm::initializeVectorization(*pass_registry); 160 llvm::initializeIPO(*pass_registry); 161 llvm::initializeAnalysis(*pass_registry); 162 llvm::initializeTransformUtils(*pass_registry); 163 llvm::initializeInstCombine(*pass_registry); 164 llvm::initializeInstrumentation(*pass_registry); 165 llvm::initializeTarget(*pass_registry); 166 llvm::initializeCodeGenPreparePass(*pass_registry); 167 } 168 169 // Returns the TargetMachine, given a triple. 170 std::unique_ptr<llvm::TargetMachine> GetTargetMachine( 171 llvm::Triple triple, tensorflow::StringPiece cpu_name, 172 const HloModuleConfig& hlo_module_config) { 173 std::string error; 174 const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error); 175 if (target == nullptr) { 176 LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'" 177 << " -- " << error; 178 return nullptr; 179 } 180 181 TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); 182 llvm_ir::SetTargetOptions( 183 /*fast_math_enabled=*/hlo_module_config.debug_options() 184 .xla_enable_fast_math(), 185 &target_options); 186 187 // Enable FMA synthesis. 188 target_options.AllowFPOpFusion = FPOpFusion::Fast; 189 190 // Set the verbose assembly options. 191 target_options.MCOptions.AsmVerbose = false; 192 193 // The selection of codegen optimization level is copied from function 194 // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp. 195 CodeGenOpt::Level codegen_opt_level; 196 switch (hlo_module_config.debug_options().xla_backend_optimization_level()) { 197 case 1: 198 codegen_opt_level = CodeGenOpt::Less; 199 break; 200 case 2: 201 codegen_opt_level = CodeGenOpt::Default; 202 break; 203 case 3: 204 codegen_opt_level = CodeGenOpt::Aggressive; 205 break; 206 default: 207 codegen_opt_level = CodeGenOpt::None; 208 } 209 return WrapUnique(target->createTargetMachine( 210 triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx42", target_options, 211 Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel), 212 codegen_opt_level)); 213 } 214 215 // Adds the standard LLVM optimization passes, based on the speed optimization 216 // level (opt_level) and size optimization level (size_level). Both module 217 // and function-level passes are added, so two pass managers are passed in and 218 // modified by this function. 219 void AddOptimizationPasses(unsigned opt_level, unsigned size_level, 220 llvm::TargetMachine* target_machine, 221 llvm::legacy::PassManagerBase* module_passes, 222 llvm::legacy::FunctionPassManager* function_passes) { 223 PassManagerBuilder builder; 224 builder.OptLevel = opt_level; 225 builder.SizeLevel = size_level; 226 227 if (opt_level > 1) { 228 builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold); 229 } else { 230 // Only inline functions marked with "alwaysinline". 231 builder.Inliner = llvm::createAlwaysInlinerLegacyPass(); 232 } 233 234 builder.DisableUnitAtATime = false; 235 builder.DisableUnrollLoops = opt_level == 0; 236 builder.LoopVectorize = opt_level > 0; 237 builder.SLPVectorize = opt_level > 1 && size_level < 2; 238 239 // NVPTX's early-as-possible passes include NVVM reflect. 240 target_machine->adjustPassManager(builder); 241 242 builder.populateFunctionPassManager(*function_passes); 243 builder.populateModulePassManager(*module_passes); 244 } 245 246 // Emits the given module to a bit code file. 247 void EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) { 248 std::error_code error_code; 249 llvm::ToolOutputFile outfile(filename.ToString().c_str(), error_code, 250 llvm::sys::fs::F_None); 251 if (error_code) { 252 LOG(FATAL) << "opening bitcode file for writing: " << error_code.message(); 253 } 254 255 llvm::WriteBitcodeToFile(module, outfile.os()); 256 outfile.keep(); 257 } 258 259 // Emits the given module to PTX. target_machine is an initialized TargetMachine 260 // for the NVPTX target. 261 string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { 262 std::string ptx; // need a std::string instead of a ::string. 263 { 264 llvm::raw_string_ostream stream(ptx); 265 llvm::buffer_ostream pstream(stream); 266 // The extension is stripped by IrDumpingPassManager, so we need to 267 // get creative to add a suffix. 268 string module_id(llvm_ir::AsString(module->getModuleIdentifier())); 269 IrDumpingPassManager codegen_passes( 270 ReplaceFilenameExtension(tensorflow::io::Basename(module_id), 271 "-nvptx.dummy"), 272 "", false); 273 codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( 274 llvm::Triple(module->getTargetTriple()))); 275 276 target_machine->addPassesToEmitFile(codegen_passes, pstream, 277 llvm::TargetMachine::CGFT_AssemblyFile); 278 codegen_passes.run(*module); 279 } 280 281 return ptx; 282 } 283 284 // LLVM has an extensive flags mechanism of its own, which is only accessible 285 // through the command line. Internal libraries within LLVM register parsers for 286 // flags, with no other way to configure them except pass these flags. 287 // To do this programmatically, we invoke ParseCommandLineOptions manually with 288 // a "fake argv". 289 // Note: setting flags with this method is stateful, since flags are just 290 // static globals within LLVM libraries. 291 void FeedLLVMWithFlags(const std::vector<string>& cl_opts) { 292 std::vector<const char*> fake_argv = {""}; 293 for (const string& cl_opt : cl_opts) { 294 fake_argv.push_back(cl_opt.c_str()); 295 } 296 llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); 297 } 298 299 // Returns whether the module could use any libdevice functions. This function 300 // may have false positives -- the module might not use libdevice even if this 301 // function returns true. 302 bool CouldNeedLibdevice(const llvm::Module& module) { 303 for (const llvm::Function& function : module.functions()) { 304 // This is a conservative approximation -- not all such functions are in 305 // libdevice. 306 if (!function.isIntrinsic() && function.isDeclaration()) { 307 return true; 308 } 309 } 310 return false; 311 } 312 313 // Links libdevice into the given module if the module needs libdevice. 314 tensorflow::Status LinkLibdeviceIfNecessary( 315 llvm::Module* module, std::pair<int, int> compute_capability, 316 const string& libdevice_dir_path) { 317 if (!CouldNeedLibdevice(*module)) { 318 return tensorflow::Status::OK(); 319 } 320 321 llvm::Linker linker(*module); 322 string libdevice_path = tensorflow::io::JoinPath( 323 libdevice_dir_path, GetLibdeviceFilename(libdevice_dir_path, 324 compute_capability)); 325 TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path)); 326 VLOG(1) << "Linking with libdevice from: " << libdevice_path; 327 std::unique_ptr<llvm::Module> libdevice_module = 328 LoadIRModule(libdevice_path, &module->getContext()); 329 if (linker.linkInModule( 330 std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, 331 [](Module& M, const StringSet<>& GVS) { 332 internalizeModule(M, [&M, &GVS](const GlobalValue& GV) { 333 return !GV.hasName() || (GVS.count(GV.getName()) == 0); 334 }); 335 })) { 336 return tensorflow::errors::Internal(tensorflow::strings::StrCat( 337 "Error linking libdevice from ", libdevice_path)); 338 } 339 return tensorflow::Status::OK(); 340 } 341 342 StatusOr<string> CompileModuleToPtx(llvm::Module* module, 343 std::pair<int, int> compute_capability, 344 const HloModuleConfig& hlo_module_config, 345 const string& libdevice_dir_path) { 346 // If the module has no functions or globals, there's nothing to compile. Just 347 // return an empty string. 348 if (module->empty() && module->global_empty()) { 349 VLOG(2) << "Module '" << llvm_ir::AsString(module->getName()) 350 << "' is empty. Skipping compilation."; 351 return string(); 352 } 353 // Link the input module with libdevice, to pull in implementations of some 354 // builtins. 355 TF_RETURN_IF_ERROR( 356 LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path)); 357 358 // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass 359 // can access it. 360 module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", 361 hlo_module_config.debug_options().xla_gpu_ftz()); 362 363 // If ftz is enabled, set it as an attribute on every function in the module. 364 if (hlo_module_config.debug_options().xla_gpu_ftz()) { 365 for (llvm::Function& fn : *module) { 366 fn.addFnAttr("nvptx-f32ftz", "true"); 367 } 368 } 369 370 IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); 371 372 // Add an appropriate TargetLibraryInfo pass for the module's triple. 373 llvm::TargetLibraryInfoWrapperPass* tliwp = 374 new llvm::TargetLibraryInfoWrapperPass( 375 llvm::Triple(module->getTargetTriple())); 376 module_passes.add(tliwp); 377 378 // Try to fetch the target triple from the module. If not present, set a 379 // default target triple. 380 llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); 381 if (target_triple.getArch() == llvm::Triple::UnknownArch) { 382 LOG(WARNING) << "target triple not found in the module"; 383 target_triple = llvm::Triple("nvptx64-unknown-unknown"); 384 } 385 386 // Figure out the exact name of the processor as known to the NVPTX backend 387 // from the gpu_architecture flag. 388 std::unique_ptr<llvm::TargetMachine> target_machine = GetTargetMachine( 389 target_triple, GetSmName(compute_capability), hlo_module_config); 390 module_passes.add(llvm::createTargetTransformInfoWrapperPass( 391 target_machine->getTargetIRAnalysis())); 392 393 // The LLVM IR verifier performs sanity checking on the IR. This helps 394 // discover problems and report them in a meaningful manner, rather than let 395 // later passes report obscure assertions because of unfulfilled invariants. 396 module_passes.add(llvm::createVerifierPass()); 397 398 // Create the function-level pass manager. It needs data layout information 399 // too. 400 llvm::legacy::FunctionPassManager function_passes(module); 401 402 int32 opt_level = 403 hlo_module_config.debug_options().xla_backend_optimization_level(); 404 405 CHECK_GE(opt_level, 2) 406 << "The XLA GPU backend doesn't support unoptimized code generation"; 407 408 AddOptimizationPasses(opt_level, 409 /*size_level=*/0, target_machine.get(), &module_passes, 410 &function_passes); 411 412 // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA 413 // again after the standard optimization passes [http://b/13329423]. 414 // TODO(jingyue): SROA may further expose more optimization opportunities such 415 // as more precise alias analysis and more function inlining (SROA may change 416 // the inlining cost of a function). For now, running SROA already emits good 417 // enough code for the evaluated benchmarks. We may want to run more 418 // optimizations later. 419 if (opt_level > 0) { 420 // LLVM's optimizer turns on SROA when the optimization level is greater 421 // than 0. We mimic this behavior here. 422 module_passes.add(llvm::createSROAPass()); 423 } 424 425 // Verify that the module is well formed after optimizations ran. 426 module_passes.add(llvm::createVerifierPass()); 427 428 // Done populating the pass managers. Now run them. 429 430 function_passes.doInitialization(); 431 for (auto func = module->begin(); func != module->end(); ++func) { 432 function_passes.run(*func); 433 } 434 function_passes.doFinalization(); 435 module_passes.run(*module); 436 437 // Finally, produce PTX. 438 return EmitModuleToPTX(module, target_machine.get()); 439 } 440 441 // One-time module initializer. 442 // Must be called only once -- DO NOT CALL DIRECTLY. 443 void GPUBackendInit(const HloModuleConfig& hlo_module_config) { 444 // Feed all customized flags here, so we can override them with llvm_cl_opts 445 // without redeploy the compiler for development purpose. 446 447 // This flag tunes a threshold in branch folding. The default threshold, which 448 // is one, is not suitable for CUDA programs where branches are more expensive 449 // than for CPU programs. Setting the threshold to 2 improves the latency of 450 // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the 451 // latency of other benchmarks so far. 452 // 453 // I also tried setting this threshold to other values: 454 // * 3-6 gives similar results as 2; 455 // * >6 start hurting the performance of at least dot product kernels. 456 // 457 // TODO(jingyue): The current threshold only considers the numbr of IR 458 // instructions which do not accurately reflect the true cost. We need a 459 // better cost model. 460 FeedLLVMWithFlags({"-bonus-inst-threshold=2"}); 461 // TODO(b/22073864): Increase limit when scan memory dependency. 462 // This helps to reduce more redundant load instructions. 463 // 464 // The specific value is currently large enough for s3d in shoc benchmark, 465 // which contains a lot of load instructions and many arithmetic instructions 466 // between those loads. 467 FeedLLVMWithFlags({"-memdep-block-scan-limit=500"}); 468 469 llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config); 470 471 // Initialize the NVPTX target; it's the only target we link with, so call its 472 // specific initialization functions instead of the catch-all InitializeAll*. 473 LLVMInitializeNVPTXTarget(); 474 LLVMInitializeNVPTXTargetInfo(); 475 LLVMInitializeNVPTXTargetMC(); 476 LLVMInitializeNVPTXAsmPrinter(); 477 478 // Initialize the LLVM optimization passes. 479 llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry(); 480 InitializePasses(registry); 481 } 482 483 } // namespace 484 485 StatusOr<string> CompileToPtx(llvm::Module* module, 486 std::pair<int, int> compute_capability, 487 const HloModuleConfig& hlo_module_config, 488 const string& libdevice_dir_path) { 489 static std::once_flag backend_init_flag; 490 std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config); 491 492 string ptx; 493 { 494 tensorflow::port::Tracing::TraceMe annotation( 495 "Compiling IR", llvm_ir::AsString(module->getName()), 496 /*is_expensive=*/true); 497 XLA_SCOPED_LOGGING_TIMER("Compile module " + 498 llvm_ir::AsString(module->getName())); 499 TF_ASSIGN_OR_RETURN( 500 ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config, 501 libdevice_dir_path)); 502 } 503 return ptx; 504 } 505 506 } // namespace gpu 507 } // namespace xla 508