Home | History | Annotate | Download | only in stream_executor
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      7     http://www.apache.org/licenses/LICENSE-2.0
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     16 // Kernel-loader specs are structures that describe how to load a data-parallel
     17 // kernel on a given platform for subsequent launching. Headers that instantiate
     18 // these data structures will typically be auto-generated. However, users can
     19 // also instantiate them by hand.
     20 //
     21 // A kernel with the same exact functionality and type signature may be
     22 // implemented on several different platforms. Typical usage is to create a
     23 // singleton that describes how to load a kernel on the various supported
     24 // platforms:
     25 //
     26 //  static const MultiKernelLoaderSpec &SaxpySpec() {
     27 //    static auto *mkls =
     28 //        (new MultiKernelLoaderSpec{4 /* = arity */})
     29 //            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
     30 //            ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
     31 //
     32 //
     33 //    return *mkls;
     34 //  }
     35 //
     36 // This lazily instantiates an object that describes how to load CUDA PTX
     37 // present on disk that implements saxpy for the CUDA platform, or
     38 // OpenCL text present on disk that implements saxpy for an OpenCL-based
     39 // platform. The CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
     40 // KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
     41 // subsequent launching on a single platform.
     42 //
     43 // For the loader functionality that accepts these KernelLoaderSpecs in order
     44 // to grab the kernel appropriately, see StreamExecutor::GetKernel().
     49 #include <stddef.h>
     50 #include <map>
     51 #include <memory>
     52 #include "tensorflow/stream_executor/platform/port.h"
     54 #include "tensorflow/stream_executor/lib/stringpiece.h"
     55 #include "tensorflow/stream_executor/platform/logging.h"
     56 #include "tensorflow/stream_executor/platform/mutex.h"
     57 #include "tensorflow/stream_executor/platform/port.h"
     59 namespace perftools {
     60 namespace gputools {
     62 // Describes how to load a kernel on a target platform.
     63 //
     64 // This is an abstract base class, subclassed for specific platforms.
     65 // The program location (i.e. the path of a PTX or OpenCL loadable
     66 // translation unit, or its in-memory contents) is stored by the subclasses
     67 // and exposed through more specifically named accessors there (e.g.
     68 // filename(), text(), bytes()).
     69 //
     70 // These kernel loader specifications are typically auto-generated into header
     71 // files at build time, but can also be specified manually.
     72 class KernelLoaderSpec {
     73  public:
     74   virtual ~KernelLoaderSpec() {}
     76   // Returns the kernel name to load out of the program.
     77   const string &kernelname() const { return kernelname_; }
     79  protected:
     80   explicit KernelLoaderSpec(port::StringPiece kernelname);
     82  private:
     83   // The kernel name that should be loaded out of the program description given
     84   // above.
     85   string kernelname_;
     87   SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
     88 };
     90 // An abstract kernel loader spec that has an associated file path, where
     91 // there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose
     92 // canonical filename suffix is ".ptx".
     93 class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
     94  public:
     95   ~OnDiskKernelLoaderSpec() override {}
     97   // Returns the path to the on-disk loadable kernel file.
     98   const string &filename() const { return filename_; }
    100   // Returns the canonical suffix for this on-disk kernel loader spec format;
    101   // e.g. PTX files on disk have a canonical suffix of ".ptx".
    102   virtual const char *CanonicalSuffix() const = 0;
    104  protected:
    105   OnDiskKernelLoaderSpec(port::StringPiece filename,
    106                          port::StringPiece kernelname);
    108   string filename_;
    110  private:
    111   SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec);
    112 };
    114 // Kernel loader specification for PTX text that resides on disk.
    115 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
    116  public:
    117   CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    118   ~CudaPtxOnDisk() override {}
    120   const char *CanonicalSuffix() const override { return ".ptx"; }
    122  private:
    124 };
    126 // Kernel loader specification for CUBIN binary that resides on disk.
    127 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
    128  public:
    129   CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    130   ~CudaCubinOnDisk() override {}
    132   const string &filename() const { return filename_; }
    134   const char *CanonicalSuffix() const override { return ".cubin"; }
    136  private:
    137   string filename_;
    139   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk);
    140 };
    142 // Kernel loader specification for PTX text that resides in memory.
    143 class CudaPtxInMemory : public KernelLoaderSpec {
    144  public:
    145   // Components: compute capability major number, compute capability minor
    146   // number, and PTX source.
    147   typedef std::tuple<int, int, port::StringPiece> PtxSpec;
    149   // Single-PTX constructor. Adds the provided PTX version with an unknown
    150   // compute capability. Since the CC is unknown, the PTX is assumed to be very
    151   // generally usable - in other words, PTX specified in this manner is VERY
    152   // likely to be used as the default! Note that the PTX can be compressed,
    153   // which is indicated by the argument ptx_compressed.
    154   //
    155   // Warning: the string backing the provided port::StringPiece ptx must outlive this
    156   // instance.
    157   CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname,
    158                   bool ptx_compressed = false);
    160   // Multiple-PTX-version constructor. Adds each item in spec_list to this
    161   // object. Note that the PTX can be compressed, which is indicated by the
    162   // argument ptx_compressed.
    163   CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list,
    164                   port::StringPiece kernel_name, bool ptx_compressed = false);
    165   ~CudaPtxInMemory() override {}
    167   // Add the PTX implementation described by ptx_spec to this object. On
    168   // collision (i.e., if a version with the same compute_capability already
    169   // exists), the existing implementation will be overwritten.
    170   void AddSpec(PtxSpec ptx_spec);
    172   // Returns pointer to the ptx of available implementation with the
    173   // lowest-valued compute capability. For example, if PTX written to CC2.0,
    174   // 3.0, and 3.5 are all available, the version for CC2.0 will be set. Returns
    175   // nullptr on failed lookup (if any version is not available).
    176   // When the ptx is compressed, returns the decompressed ptx.
    177   const char *default_text() const;
    179   // Similar to default_text().
    180   // When the ptx is compressed, returns the decompressed ptx.
    181   const char *original_default_text() const;
    183   // Returns pointer to the ptx for the requested compute capability.
    184   // Returns nullptr on failed lookup (if the requested version is not
    185   // available).
    186   // When the ptx is compressed, returns the decompressed ptx.
    187   const char *text(int compute_capability_major,
    188                    int compute_capability_minor) const;
    190   // Similar to text().
    191   // When the ptx is compressed, returns the original compressed ptx.
    192   const char *original_text(int compute_capability_major,
    193                             int compute_capability_minor) const;
    195   // Decompresses the PTX string using bzip2.
    196   static string DecompressPtx(const char *ptx);
    198  private:
    199   // PTX translation unit text contents in memory. The key is of as a tuple
    200   // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". Because CC's
    201   // represented in this way have a clear sorting order, map::begin() will give
    202   // the lowest-numbered version available, i.e. the default.
    203   std::map<std::tuple<int, int>, const char *,
    204            bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)>
    205       ptx_by_compute_capability_;
    207   // Stores all decompressed ptx strings, with original ptx string as keys.
    208   // It is marked as mutable for lazy decompression.
    209   mutable std::map<const char *, string> decompressed_ptx_;
    210   mutable mutex mu_;
    212   // Defines the minimum compute capability possible. Used when PTX has no
    213   // compute capability specified (in the single-PTX constructor).
    214   static const std::tuple<int, int> kMinimumCapability;
    216   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
    217 };
    219 // Kernel loader specification for OpenCL text that resides on disk.
    220 class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
    221  public:
    222   OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    223   ~OpenCLTextOnDisk() override {}
    225   const char *CanonicalSuffix() const override { return ".ocl"; }
    227  private:
    229 };
    231 // Kernel loader specification for OpenCL binary that resides on disk.
    232 class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
    233  public:
    234   OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    235   ~OpenCLBinaryOnDisk() override {}
    237   const char *CanonicalSuffix() const override { return ".aocx"; }
    239  private:
    240   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk);
    241 };
    243 // Kernel loader specification for OpenCL text that resides in memory.
    244 class OpenCLTextInMemory : public KernelLoaderSpec {
    245  public:
    246   OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname);
    247   ~OpenCLTextInMemory() override {}
    249   // Returns the OpenCL text contents.
    250   const string &text() const { return text_; }
    252  private:
    253   // OpenCL translation unit text contents in memory.
    254   string text_;
    256   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory);
    257 };
    259 // Kernel loader specification for a CUBIN blob that resides in memory.
    260 class CudaCubinInMemory : public KernelLoaderSpec {
    261  public:
    262   CudaCubinInMemory(const char *bytes, port::StringPiece kernelname);
    263   ~CudaCubinInMemory() override {}
    265   const char *bytes() const { return bytes_; }
    267  private:
    268   const char *bytes_;
    270   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory);
    271 };
    273 // Describes how to load a kernel on any subset of a number of target platforms.
    274 class MultiKernelLoaderSpec {
    275  public:
    276   explicit MultiKernelLoaderSpec(size_t arity);
    278   // Returns the number of arguments that this kernel accepts.
    279   size_t arity() const { return arity_; }
    281   // Convenience getters for testing whether these platform variants have
    282   // kernel loader specifications available.
    283   bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; }
    284   bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; }
    285   bool has_cuda_cubin_in_memory() const {
    286     return cuda_cubin_in_memory_ != nullptr;
    287   }
    288   bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; }
    289   bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; }
    290   bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; }
    291   bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; }
    293   // Accessors for platform variant kernel load specifications.
    294   // Precondition: corresponding has_* is true.
    295   const CudaPtxOnDisk &cuda_ptx_on_disk() const {
    296     CHECK(has_cuda_ptx_on_disk());
    297     return *cuda_ptx_on_disk_;
    298   }
    299   const CudaCubinOnDisk &cuda_cubin_on_disk() const {
    300     CHECK(has_cuda_cubin_on_disk());
    301     return *cuda_cubin_on_disk_;
    302   }
    303   const CudaCubinInMemory &cuda_cubin_in_memory() const {
    304     CHECK(has_cuda_cubin_in_memory());
    305     return *cuda_cubin_in_memory_;
    306   }
    307   const CudaPtxInMemory &cuda_ptx_in_memory() const {
    308     CHECK(has_cuda_ptx_in_memory());
    309     return *cuda_ptx_in_memory_;
    310   }
    311   const OpenCLTextOnDisk &ocl_text_on_disk() const {
    312     CHECK(has_ocl_text_on_disk());
    313     return *ocl_text_on_disk_;
    314   }
    315   const OpenCLBinaryOnDisk &ocl_binary_on_disk() const {
    316     CHECK(has_ocl_binary_on_disk());
    317     return *ocl_binary_on_disk_;
    318   }
    319   const OpenCLTextInMemory &ocl_text_in_memory() const {
    320     CHECK(has_ocl_text_in_memory());
    321     return *ocl_text_in_memory_;
    322   }
    324   // Builder-pattern-like methods for use in initializing a
    325   // MultiKernelLoaderSpec. Each of these should be used at most once for a
    326   // single MultiKernelLoaderSpec object. See file comment for example usage.
    327   //
    328   // Note that the kernelname parameter must be consistent with the kernel in
    329   // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
    330   // name may be mangled by the compiler if it is not declared in an
    331   // extern "C" scope.
    332   MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename,
    333                                              port::StringPiece kernelname);
    334   MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename,
    335                                                port::StringPiece kernelname);
    336   MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text,
    337                                                port::StringPiece kernelname);
    338   MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename,
    339                                           port::StringPiece kernelname);
    340   MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename,
    341                                             port::StringPiece kernelname);
    342   MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes,
    343                                               port::StringPiece kernelname);
    344   MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx,
    345                                             port::StringPiece kernelname);
    346   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
    347       port::StringPiece ptx, port::StringPiece kernelname);
    348   MultiKernelLoaderSpec *AddCudaPtxInMemory(
    349       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
    350       port::StringPiece kernelname);
    351   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
    352       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
    353       port::StringPiece kernelname);
    355  private:
    356   std::unique_ptr<CudaPtxOnDisk>
    357       cuda_ptx_on_disk_;  // PTX text that resides in a file.
    358   std::unique_ptr<CudaCubinOnDisk>
    359       cuda_cubin_on_disk_;  // Binary CUDA program in a file.
    360   std::unique_ptr<CudaCubinInMemory>
    361       cuda_cubin_in_memory_;  // Binary CUDA program in memory.
    362   std::unique_ptr<CudaPtxInMemory>
    363       cuda_ptx_in_memory_;  // PTX text that resides in memory.
    364   std::unique_ptr<OpenCLTextOnDisk>
    365       ocl_text_on_disk_;  // OpenCL text that resides on disk.
    366   std::unique_ptr<OpenCLBinaryOnDisk>
    367       ocl_binary_on_disk_;  // OpenCL binary that resides on disk.
    368   std::unique_ptr<OpenCLTextInMemory>
    369       ocl_text_in_memory_;  // OpenCL text that resides in memory.
    371   // Number of parameters that the kernel takes. (This is nicer to have in a
    372   // constexpr than having to determine it from the types via template
    373   // metaprogramming).
    374   size_t arity_;
    375 };
    377 }  // namespace gputools
    378 }  // namespace perftools