Home | History | Annotate | Download | only in llvm_gpu_backend
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
     17 
     18 #include <map>
     19 #include <memory>
     20 #include <string>
     21 #include <utility>
     22 
     23 #include "tensorflow/compiler/xla/ptr_util.h"
     24 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
     25 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
     26 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
     27 #include "tensorflow/compiler/xla/status_macros.h"
     28 #include "tensorflow/compiler/xla/util.h"
     29 
     30 #include "llvm/ADT/STLExtras.h"
     31 #include "llvm/ADT/StringMap.h"
     32 #include "llvm/ADT/StringSet.h"
     33 #include "llvm/Analysis/TargetLibraryInfo.h"
     34 #include "llvm/Analysis/TargetTransformInfo.h"
     35 #include "llvm/Bitcode/BitcodeReader.h"
     36 #include "llvm/Bitcode/BitcodeWriter.h"
     37 #include "llvm/CodeGen/CommandFlags.def"
     38 #include "llvm/IR/LLVMContext.h"
     39 #include "llvm/IR/LegacyPassManager.h"
     40 #include "llvm/IR/Module.h"
     41 #include "llvm/IR/Verifier.h"
     42 #include "llvm/Linker/Linker.h"
     43 #include "llvm/PassRegistry.h"
     44 #include "llvm/Support/CommandLine.h"
     45 #include "llvm/Support/FileSystem.h"
     46 #include "llvm/Support/FormattedStream.h"
     47 #include "llvm/Support/TargetRegistry.h"
     48 #include "llvm/Support/TargetSelect.h"
     49 #include "llvm/Support/ToolOutputFile.h"
     50 #include "llvm/Target/TargetMachine.h"
     51 #include "llvm/Transforms/IPO.h"
     52 #include "llvm/Transforms/IPO/AlwaysInliner.h"
     53 #include "llvm/Transforms/IPO/Internalize.h"
     54 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
     55 #include "llvm/Transforms/Scalar.h"
     56 #include "tensorflow/compiler/xla/types.h"
     57 #include "tensorflow/core/lib/core/stringpiece.h"
     58 #include "tensorflow/core/lib/io/path.h"
     59 #include "tensorflow/core/lib/strings/str_util.h"
     60 #include "tensorflow/core/lib/strings/stringprintf.h"
     61 #include "tensorflow/core/platform/env.h"
     62 #include "tensorflow/core/platform/logging.h"
     63 #include "tensorflow/core/platform/tracing.h"
     64 
     65 namespace xla {
     66 namespace gpu {
     67 namespace {
     68 
// Default inline threshold value to use in llvm.  Forwarded to
// llvm::createFunctionInliningPass in AddOptimizationPasses below when the
// optimization level is above 1.
const int kDefaultInlineThreshold = 1100;
     71 
     72 // Gets the libdevice filename for a particular compute capability.  When
     73 // presented with a GPU we don't recognize, we just return the libdevice from
     74 // compute_20.
     75 static string GetLibdeviceFilename(const string& libdevice_dir_path,
     76                                    std::pair<int, int> compute_capability) {
     77   // Since CUDA 9.0, all GPU versions are included in a single file
     78   const char* unified_libdevice_filename = "libdevice.10.bc";
     79   std::vector<string> unified_libdevice_files;
     80   const tensorflow::Status status =
     81     tensorflow::Env::Default()->GetMatchingPaths(
     82       tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
     83       &unified_libdevice_files);
     84   if (status.ok() && unified_libdevice_files.size() == 1) {
     85     return unified_libdevice_filename;
     86   }
     87   // There are only four libdevice files: compute_{20,30,35,50}.  Each GPU
     88   // version gets mapped to one of these.  Note in particular that sm_60 and
     89   // sm_61 map to libdevice.compute_30.
     90   static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
     91                                                            {{2, 1}, 20},
     92                                                            {{3, 0}, 30},
     93                                                            {{3, 2}, 30},
     94                                                            {{3, 5}, 35},
     95                                                            {{3, 7}, 35},
     96                                                            {{5, 0}, 50},
     97                                                            {{5, 2}, 50},
     98                                                            {{5, 3}, 50},
     99                                                            {{6, 0}, 30},
    100                                                            {{6, 1}, 30},
    101                                                            {{6, 2}, 30}});
    102   int libdevice_version = 20;
    103   auto it = m->find(compute_capability);
    104   if (it != m->end()) {
    105     libdevice_version = it->second;
    106   } else {
    107     LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
    108                  << ", " << compute_capability.second << ") ."
    109                  << "Defaulting to libdevice for compute_" << libdevice_version;
    110   }
    111   return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version,
    112                                      ".10.bc");
    113 }
    114 
    115 // Gets the GPU name as it's known to LLVM for a given compute capability.  If
    116 // we see an unrecognized compute capability, we return "sm_30".
    117 static string GetSmName(std::pair<int, int> compute_capability) {
    118   static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
    119                                                            {{2, 1}, 21},
    120                                                            {{3, 0}, 30},
    121                                                            {{3, 2}, 32},
    122                                                            {{3, 5}, 35},
    123                                                            {{3, 7}, 37},
    124                                                            {{5, 0}, 50},
    125                                                            {{5, 2}, 52},
    126                                                            {{5, 3}, 53},
    127                                                            {{6, 0}, 60},
    128                                                            {{6, 1}, 61},
    129                                                            {{6, 2}, 62},
    130                     // TODO: Change this to 70 once LLVM NVPTX supports it
    131                                                            {{7, 0}, 60}});
    132   int sm_version = 30;
    133   auto it = m->find(compute_capability);
    134   if (it != m->end()) {
    135     sm_version = it->second;
    136   } else {
    137     LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
    138                  << ", " << compute_capability.second << ") ."
    139                  << "Defaulting to telling LLVM that we're compiling for sm_"
    140                  << sm_version;
    141   }
    142   return tensorflow::strings::StrCat("sm_", sm_version);
    143 }
    144 
    145 // Convenience function for producing a name of a temporary compilation product
    146 // from the input filename.
    147 string MakeNameForTempProduct(const std::string& input_filename,
    148                               tensorflow::StringPiece extension) {
    149   return ReplaceFilenameExtension(
    150       tensorflow::io::Basename(llvm_ir::AsString(input_filename)), extension);
    151 }
    152 
    153 // Initializes LLVM passes. Uses the PassRegistry mechanism.
    154 void InitializePasses(llvm::PassRegistry* pass_registry) {
    155   llvm::initializeCore(*pass_registry);
    156   llvm::initializeCodeGen(*pass_registry);
    157   llvm::initializeScalarOpts(*pass_registry);
    158   llvm::initializeObjCARCOpts(*pass_registry);
    159   llvm::initializeVectorization(*pass_registry);
    160   llvm::initializeIPO(*pass_registry);
    161   llvm::initializeAnalysis(*pass_registry);
    162   llvm::initializeTransformUtils(*pass_registry);
    163   llvm::initializeInstCombine(*pass_registry);
    164   llvm::initializeInstrumentation(*pass_registry);
    165   llvm::initializeTarget(*pass_registry);
    166   llvm::initializeCodeGenPreparePass(*pass_registry);
    167 }
    168 
// Returns the TargetMachine, given a triple.
//
// cpu_name is the processor name (e.g. "sm_35") handed to LLVM, and
// hlo_module_config supplies the fast-math and optimization-level debug
// options.  Reloc/code-model defaults (RelocModel, CMModel) come from the
// globals defined by the included llvm/CodeGen/CommandFlags.def.
std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
    llvm::Triple triple, tensorflow::StringPiece cpu_name,
    const HloModuleConfig& hlo_module_config) {
  std::string error;
  const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error);
  if (target == nullptr) {
    // LOG(FATAL) aborts the process, so the return below is defensive only.
    LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
               << " -- " << error;
    return nullptr;
  }

  TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
  llvm_ir::SetTargetOptions(
      /*fast_math_enabled=*/hlo_module_config.debug_options()
          .xla_enable_fast_math(),
      &target_options);

  // Enable FMA synthesis.
  target_options.AllowFPOpFusion = FPOpFusion::Fast;

  // Set the verbose assembly options.
  target_options.MCOptions.AsmVerbose = false;

  // The selection of codegen optimization level is copied from function
  // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp.
  // Any level outside 1..3 falls back to no codegen optimization.
  CodeGenOpt::Level codegen_opt_level;
  switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
    case 1:
      codegen_opt_level = CodeGenOpt::Less;
      break;
    case 2:
      codegen_opt_level = CodeGenOpt::Default;
      break;
    case 3:
      codegen_opt_level = CodeGenOpt::Aggressive;
      break;
    default:
      codegen_opt_level = CodeGenOpt::None;
  }
  // "+ptx42" requests the PTX 4.2 feature set from the NVPTX backend.
  return WrapUnique(target->createTargetMachine(
      triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx42", target_options,
      Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel),
      codegen_opt_level));
}
    214 
    215 // Adds the standard LLVM optimization passes, based on the speed optimization
    216 // level (opt_level) and size optimization level (size_level). Both module
    217 // and function-level passes are added, so two pass managers are passed in and
    218 // modified by this function.
    219 void AddOptimizationPasses(unsigned opt_level, unsigned size_level,
    220                            llvm::TargetMachine* target_machine,
    221                            llvm::legacy::PassManagerBase* module_passes,
    222                            llvm::legacy::FunctionPassManager* function_passes) {
    223   PassManagerBuilder builder;
    224   builder.OptLevel = opt_level;
    225   builder.SizeLevel = size_level;
    226 
    227   if (opt_level > 1) {
    228     builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold);
    229   } else {
    230     // Only inline functions marked with "alwaysinline".
    231     builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
    232   }
    233 
    234   builder.DisableUnitAtATime = false;
    235   builder.DisableUnrollLoops = opt_level == 0;
    236   builder.LoopVectorize = opt_level > 0;
    237   builder.SLPVectorize = opt_level > 1 && size_level < 2;
    238 
    239   // NVPTX's early-as-possible passes include NVVM reflect.
    240   target_machine->adjustPassManager(builder);
    241 
    242   builder.populateFunctionPassManager(*function_passes);
    243   builder.populateModulePassManager(*module_passes);
    244 }
    245 
    246 // Emits the given module to a bit code file.
    247 void EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) {
    248   std::error_code error_code;
    249   llvm::ToolOutputFile outfile(filename.ToString().c_str(), error_code,
    250                                llvm::sys::fs::F_None);
    251   if (error_code) {
    252     LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
    253   }
    254 
    255   llvm::WriteBitcodeToFile(module, outfile.os());
    256   outfile.keep();
    257 }
    258 
// Emits the given module to PTX. target_machine is an initialized TargetMachine
// for the NVPTX target.  The codegen passes mutate `module`; the generated
// PTX text is returned.
string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
  std::string ptx;  // need a std::string instead of a ::string.
  {
    // The inner scope ensures both streams are destroyed (and therefore
    // flushed into `ptx`) before `ptx` is returned.
    llvm::raw_string_ostream stream(ptx);
    llvm::buffer_ostream pstream(stream);
    // The extension is stripped by IrDumpingPassManager, so we need to
    // get creative to add a suffix.
    string module_id(llvm_ir::AsString(module->getModuleIdentifier()));
    IrDumpingPassManager codegen_passes(
        ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
                                 "-nvptx.dummy"),
        "", false);
    codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
        llvm::Triple(module->getTargetTriple())));

    // NOTE(review): addPassesToEmitFile's failure return value is ignored
    // here; a codegen setup problem would surface as empty PTX output.
    target_machine->addPassesToEmitFile(codegen_passes, pstream,
                                        llvm::TargetMachine::CGFT_AssemblyFile);
    codegen_passes.run(*module);
  }

  return ptx;
}
    283 
    284 // LLVM has an extensive flags mechanism of its own, which is only accessible
    285 // through the command line. Internal libraries within LLVM register parsers for
    286 // flags, with no other way to configure them except pass these flags.
    287 // To do this programmatically, we invoke ParseCommandLineOptions manually with
    288 // a "fake argv".
    289 // Note: setting flags with this method is stateful, since flags are just
    290 // static globals within LLVM libraries.
    291 void FeedLLVMWithFlags(const std::vector<string>& cl_opts) {
    292   std::vector<const char*> fake_argv = {""};
    293   for (const string& cl_opt : cl_opts) {
    294     fake_argv.push_back(cl_opt.c_str());
    295   }
    296   llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
    297 }
    298 
    299 // Returns whether the module could use any libdevice functions. This function
    300 // may have false positives -- the module might not use libdevice even if this
    301 // function returns true.
    302 bool CouldNeedLibdevice(const llvm::Module& module) {
    303   for (const llvm::Function& function : module.functions()) {
    304     // This is a conservative approximation -- not all such functions are in
    305     // libdevice.
    306     if (!function.isIntrinsic() && function.isDeclaration()) {
    307       return true;
    308     }
    309   }
    310   return false;
    311 }
    312 
// Links libdevice into the given module if the module needs libdevice.
//
// The libdevice bitcode file is selected from libdevice_dir_path based on
// compute_capability (see GetLibdeviceFilename).  Returns a filesystem error
// if the file is missing and an Internal error if linking fails.
tensorflow::Status LinkLibdeviceIfNecessary(
    llvm::Module* module, std::pair<int, int> compute_capability,
    const string& libdevice_dir_path) {
  // Cheap conservative check: skip the link entirely when the module has no
  // external declarations that libdevice could possibly satisfy.
  if (!CouldNeedLibdevice(*module)) {
    return tensorflow::Status::OK();
  }

  llvm::Linker linker(*module);
  string libdevice_path = tensorflow::io::JoinPath(
      libdevice_dir_path, GetLibdeviceFilename(libdevice_dir_path,
                                               compute_capability));
  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path));
  VLOG(1) << "Linking with libdevice from: " << libdevice_path;
  std::unique_ptr<llvm::Module> libdevice_module =
      LoadIRModule(libdevice_path, &module->getContext());
  // LinkOnlyNeeded pulls in just the libdevice symbols the module references.
  // The callback's predicate returns true for globals that must stay
  // external; named globals listed in GVS (those just linked in) return
  // false and are therefore internalized, so unused ones can be stripped by
  // later optimization passes.
  if (linker.linkInModule(
          std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded,
          [](Module& M, const StringSet<>& GVS) {
            internalizeModule(M, [&M, &GVS](const GlobalValue& GV) {
              return !GV.hasName() || (GVS.count(GV.getName()) == 0);
            });
          })) {
    return tensorflow::errors::Internal(tensorflow::strings::StrCat(
        "Error linking libdevice from ", libdevice_path));
  }
  return tensorflow::Status::OK();
}
    341 
// Runs the full optimization and codegen pipeline over `module` and returns
// the resulting PTX text (or an error Status).  Links libdevice if needed,
// configures ftz, runs the standard LLVM pass pipeline at the configured
// optimization level, then emits PTX via the NVPTX backend.
StatusOr<string> CompileModuleToPtx(llvm::Module* module,
                                    std::pair<int, int> compute_capability,
                                    const HloModuleConfig& hlo_module_config,
                                    const string& libdevice_dir_path) {
  // If the module has no functions or globals, there's nothing to compile. Just
  // return an empty string.
  if (module->empty() && module->global_empty()) {
    VLOG(2) << "Module '" << llvm_ir::AsString(module->getName())
            << "' is empty. Skipping compilation.";
    return string();
  }
  // Link the input module with libdevice, to pull in implementations of some
  // builtins.
  TF_RETURN_IF_ERROR(
      LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path));

  // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass
  // can access it.
  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                        hlo_module_config.debug_options().xla_gpu_ftz());

  // If ftz is enabled, set it as an attribute on every function in the module.
  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("nvptx-f32ftz", "true");
    }
  }

  IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false);

  // Add an appropriate TargetLibraryInfo pass for the module's triple.
  llvm::TargetLibraryInfoWrapperPass* tliwp =
      new llvm::TargetLibraryInfoWrapperPass(
          llvm::Triple(module->getTargetTriple()));
  module_passes.add(tliwp);

  // Try to fetch the target triple from the module. If not present, set a
  // default target triple.
  llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
  if (target_triple.getArch() == llvm::Triple::UnknownArch) {
    LOG(WARNING) << "target triple not found in the module";
    target_triple = llvm::Triple("nvptx64-unknown-unknown");
  }

  // Figure out the exact name of the processor as known to the NVPTX backend
  // from the gpu_architecture flag.
  std::unique_ptr<llvm::TargetMachine> target_machine = GetTargetMachine(
      target_triple, GetSmName(compute_capability), hlo_module_config);
  module_passes.add(llvm::createTargetTransformInfoWrapperPass(
      target_machine->getTargetIRAnalysis()));

  // The LLVM IR verifier performs sanity checking on the IR. This helps
  // discover problems and report them in a meaningful manner, rather than let
  // later passes report obscure assertions because of unfulfilled invariants.
  module_passes.add(llvm::createVerifierPass());

  // Create the function-level pass manager. It needs data layout information
  // too.
  llvm::legacy::FunctionPassManager function_passes(module);

  int32 opt_level =
      hlo_module_config.debug_options().xla_backend_optimization_level();

  CHECK_GE(opt_level, 2)
      << "The XLA GPU backend doesn't support unoptimized code generation";

  AddOptimizationPasses(opt_level,
                        /*size_level=*/0, target_machine.get(), &module_passes,
                        &function_passes);

  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
  // again after the standard optimization passes [http://b/13329423].
  // TODO(jingyue): SROA may further expose more optimization opportunities such
  // as more precise alias analysis and more function inlining (SROA may change
  // the inlining cost of a function). For now, running SROA already emits good
  // enough code for the evaluated benchmarks. We may want to run more
  // optimizations later.
  // NOTE(review): this condition is always true given the CHECK_GE(opt_level,
  // 2) above; kept for parity with LLVM's own opt_level handling.
  if (opt_level > 0) {
    // LLVM's optimizer turns on SROA when the optimization level is greater
    // than 0. We mimic this behavior here.
    module_passes.add(llvm::createSROAPass());
  }

  // Verify that the module is well formed after optimizations ran.
  module_passes.add(llvm::createVerifierPass());

  // Done populating the pass managers. Now run them.

  // Function passes run first, per function, then the module passes run once
  // over the whole module.
  function_passes.doInitialization();
  for (auto func = module->begin(); func != module->end(); ++func) {
    function_passes.run(*func);
  }
  function_passes.doFinalization();
  module_passes.run(*module);

  // Finally, produce PTX.
  return EmitModuleToPTX(module, target_machine.get());
}
    440 
// One-time module initializer.
// Must be called only once -- DO NOT CALL DIRECTLY.
// (CompileToPtx below invokes it through std::call_once.)
void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
  // Feed all customized flags here, so we can override them with llvm_cl_opts
  // without redeploying the compiler for development purposes.

  // This flag tunes a threshold in branch folding. The default threshold, which
  // is one, is not suitable for CUDA programs where branches are more expensive
  // than for CPU programs. Setting the threshold to 2 improves the latency of
  // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the
  // latency of other benchmarks so far.
  //
  // I also tried setting this threshold to other values:
  // * 3-6 gives similar results as 2;
  // * >6 start hurting the performance of at least dot product kernels.
  //
  // TODO(jingyue): The current threshold only considers the number of IR
  // instructions which do not accurately reflect the true cost. We need a
  // better cost model.
  FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
  // TODO(b/22073864): Increase limit when scan memory dependency.
  // This helps to reduce more redundant load instructions.
  //
  // The specific value is currently large enough for s3d in shoc benchmark,
  // which contains a lot of load instructions and many arithmetic instructions
  // between those loads.
  FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});

  // Apply any user-provided LLVM command-line options from the module config.
  llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);

  // Initialize the NVPTX target; it's the only target we link with, so call its
  // specific initialization functions instead of the catch-all InitializeAll*.
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  // Initialize the LLVM optimization passes.
  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
  InitializePasses(registry);
}
    482 
    483 }  // namespace
    484 
    485 StatusOr<string> CompileToPtx(llvm::Module* module,
    486                               std::pair<int, int> compute_capability,
    487                               const HloModuleConfig& hlo_module_config,
    488                               const string& libdevice_dir_path) {
    489   static std::once_flag backend_init_flag;
    490   std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config);
    491 
    492   string ptx;
    493   {
    494     tensorflow::port::Tracing::TraceMe annotation(
    495         "Compiling IR", llvm_ir::AsString(module->getName()),
    496         /*is_expensive=*/true);
    497     XLA_SCOPED_LOGGING_TIMER("Compile module " +
    498                              llvm_ir::AsString(module->getName()));
    499     TF_ASSIGN_OR_RETURN(
    500         ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
    501                                 libdevice_dir_path));
    502   }
    503   return ptx;
    504 }
    505 
    506 }  // namespace gpu
    507 }  // namespace xla
    508