Home | History | Annotate | Download | only in stream_executor
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // Kernel-loader specs are structures that describe how to load a data-parallel
     17 // kernel on a given platform for subsequent launching. Headers that instantiate
     18 // these data structures will typically be auto-generated. However, users can
     19 // also instantiate them by hand.
     20 //
     21 // A kernel with the same exact functionality and type signature may be
     22 // implemented on several different platforms. Typical usage is to create a
     23 // singleton that describes how to load a kernel on the various supported
     24 // platforms:
     25 //
//  static const MultiKernelLoaderSpec &SaxpySpec() {
//    static auto *mkls =
//        (new MultiKernelLoaderSpec{4 /* = arity */})
//            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
//            ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
//
//    return *mkls;
//  }
//
     35 //
     36 // This lazily instantiates an object that describes how to load CUDA PTX
// present on disk that implements saxpy for the CUDA platform, or
     38 // OpenCL text present on disk that implements saxpy for an OpenCL-based
     39 // platform. The CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
     40 // KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
     41 // subsequent launching on a single platform.
     42 //
     43 // For the loader functionality that accepts these KernelLoaderSpecs in order
     44 // to grab the kernel appropriately, see StreamExecutor::GetKernel().
     45 
     46 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
     47 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
     48 
#include <stddef.h>

#include <initializer_list>
#include <map>
#include <memory>
#include <tuple>

#include "tensorflow/stream_executor/platform/port.h"

#include "tensorflow/stream_executor/lib/stringpiece.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
     58 
     59 namespace perftools {
     60 namespace gputools {
     61 
     62 // Describes how to load a kernel on a target platform.
     63 //
     64 // This is an abstract base class, subclassed for specific platforms.
     65 // The filename_or_text field represents the program location (i.e. PTX or
     66 // OpenCL loadable translation unit path) and is simply stored; whether it is a
     67 // filename or text is exposed via more specifically named accessors in
     68 // subclasses.
     69 //
     70 // These kernel loader specifications are typically auto-generated into header
     71 // files at build time, but can also be specified manually.
     72 class KernelLoaderSpec {
     73  public:
     74   virtual ~KernelLoaderSpec() {}
     75 
     76   // Returns the kernel name to load out of the program.
     77   const string &kernelname() const { return kernelname_; }
     78 
     79  protected:
     80   explicit KernelLoaderSpec(port::StringPiece kernelname);
     81 
     82  private:
     83   // The kernel name that should be loaded out of the program description given
     84   // above.
     85   string kernelname_;
     86 
     87   SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
     88 };
     89 
     90 // An abstract kernel loader spec that has an associated file path, where
     91 // there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose
     92 // canonical filename suffix is ".ptx".
     93 class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
     94  public:
     95   ~OnDiskKernelLoaderSpec() override {}
     96 
     97   // Returns the path to the on-disk loadable kernel file.
     98   const string &filename() const { return filename_; }
     99 
    100   // Returns the canonical suffix for this on-disk kernel loader spec format;
    101   // e.g. PTX files on disk have a canonical suffix of ".ptx".
    102   virtual const char *CanonicalSuffix() const = 0;
    103 
    104  protected:
    105   OnDiskKernelLoaderSpec(port::StringPiece filename,
    106                          port::StringPiece kernelname);
    107 
    108   string filename_;
    109 
    110  private:
    111   SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec);
    112 };
    113 
    114 // Kernel loader specification for PTX text that resides on disk.
    115 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
    116  public:
    117   CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    118   ~CudaPtxOnDisk() override {}
    119 
    120   const char *CanonicalSuffix() const override { return ".ptx"; }
    121 
    122  private:
    123   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk);
    124 };
    125 
    126 // Kernel loader specification for CUBIN binary that resides on disk.
    127 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
    128  public:
    129   CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    130   ~CudaCubinOnDisk() override {}
    131 
    132   const string &filename() const { return filename_; }
    133 
    134   const char *CanonicalSuffix() const override { return ".cubin"; }
    135 
    136  private:
    137   string filename_;
    138 
    139   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk);
    140 };
    141 
    142 // Kernel loader specification for PTX text that resides in memory.
    143 class CudaPtxInMemory : public KernelLoaderSpec {
    144  public:
    145   // Components: compute capability major number, compute capability minor
    146   // number, and PTX source.
    147   typedef std::tuple<int, int, port::StringPiece> PtxSpec;
    148 
    149   // Single-PTX constructor. Adds the provided PTX version with an unknown
    150   // compute capability. Since the CC is unknown, the PTX is assumed to be very
    151   // generally usable - in other words, PTX specified in this manner is VERY
    152   // likely to be used as the default! Note that the PTX can be compressed,
    153   // which is indicated by the argument ptx_compressed.
    154   //
    155   // Warning: the string backing the provided port::StringPiece ptx must outlive this
    156   // instance.
    157   CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname,
    158                   bool ptx_compressed = false);
    159 
    160   // Multiple-PTX-version constructor. Adds each item in spec_list to this
    161   // object. Note that the PTX can be compressed, which is indicated by the
    162   // argument ptx_compressed.
    163   CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list,
    164                   port::StringPiece kernel_name, bool ptx_compressed = false);
    165   ~CudaPtxInMemory() override {}
    166 
    167   // Add the PTX implementation described by ptx_spec to this object. On
    168   // collision (i.e., if a version with the same compute_capability already
    169   // exists), the existing implementation will be overwritten.
    170   void AddSpec(PtxSpec ptx_spec);
    171 
    172   // Returns pointer to the ptx of available implementation with the
    173   // lowest-valued compute capability. For example, if PTX written to CC2.0,
    174   // 3.0, and 3.5 are all available, the version for CC2.0 will be set. Returns
    175   // nullptr on failed lookup (if any version is not available).
    176   // When the ptx is compressed, returns the decompressed ptx.
    177   const char *default_text() const;
    178 
    179   // Similar to default_text().
    180   // When the ptx is compressed, returns the decompressed ptx.
    181   const char *original_default_text() const;
    182 
    183   // Returns pointer to the ptx for the requested compute capability.
    184   // Returns nullptr on failed lookup (if the requested version is not
    185   // available).
    186   // When the ptx is compressed, returns the decompressed ptx.
    187   const char *text(int compute_capability_major,
    188                    int compute_capability_minor) const;
    189 
    190   // Similar to text().
    191   // When the ptx is compressed, returns the original compressed ptx.
    192   const char *original_text(int compute_capability_major,
    193                             int compute_capability_minor) const;
    194 
    195   // Decompresses the PTX string using bzip2.
    196   static string DecompressPtx(const char *ptx);
    197 
    198  private:
    199   // PTX translation unit text contents in memory. The key is of as a tuple
    200   // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". Because CC's
    201   // represented in this way have a clear sorting order, map::begin() will give
    202   // the lowest-numbered version available, i.e. the default.
    203   std::map<std::tuple<int, int>, const char *,
    204            bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)>
    205       ptx_by_compute_capability_;
    206 
    207   // Stores all decompressed ptx strings, with original ptx string as keys.
    208   // It is marked as mutable for lazy decompression.
    209   mutable std::map<const char *, string> decompressed_ptx_;
    210   mutable mutex mu_;
    211 
    212   // Defines the minimum compute capability possible. Used when PTX has no
    213   // compute capability specified (in the single-PTX constructor).
    214   static const std::tuple<int, int> kMinimumCapability;
    215 
    216   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
    217 };
    218 
    219 // Kernel loader specification for OpenCL text that resides on disk.
    220 class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
    221  public:
    222   OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    223   ~OpenCLTextOnDisk() override {}
    224 
    225   const char *CanonicalSuffix() const override { return ".ocl"; }
    226 
    227  private:
    228   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk);
    229 };
    230 
    231 // Kernel loader specification for OpenCL binary that resides on disk.
    232 class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
    233  public:
    234   OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname);
    235   ~OpenCLBinaryOnDisk() override {}
    236 
    237   const char *CanonicalSuffix() const override { return ".aocx"; }
    238 
    239  private:
    240   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk);
    241 };
    242 
    243 // Kernel loader specification for OpenCL text that resides in memory.
    244 class OpenCLTextInMemory : public KernelLoaderSpec {
    245  public:
    246   OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname);
    247   ~OpenCLTextInMemory() override {}
    248 
    249   // Returns the OpenCL text contents.
    250   const string &text() const { return text_; }
    251 
    252  private:
    253   // OpenCL translation unit text contents in memory.
    254   string text_;
    255 
    256   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory);
    257 };
    258 
    259 // Kernel loader specification for a CUBIN blob that resides in memory.
    260 class CudaCubinInMemory : public KernelLoaderSpec {
    261  public:
    262   CudaCubinInMemory(const char *bytes, port::StringPiece kernelname);
    263   ~CudaCubinInMemory() override {}
    264 
    265   const char *bytes() const { return bytes_; }
    266 
    267  private:
    268   const char *bytes_;
    269 
    270   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory);
    271 };
    272 
    273 // Describes how to load a kernel on any subset of a number of target platforms.
    274 class MultiKernelLoaderSpec {
    275  public:
    276   explicit MultiKernelLoaderSpec(size_t arity);
    277 
    278   // Returns the number of arguments that this kernel accepts.
    279   size_t arity() const { return arity_; }
    280 
    281   // Convenience getters for testing whether these platform variants have
    282   // kernel loader specifications available.
    283   bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; }
    284   bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; }
    285   bool has_cuda_cubin_in_memory() const {
    286     return cuda_cubin_in_memory_ != nullptr;
    287   }
    288   bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; }
    289   bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; }
    290   bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; }
    291   bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; }
    292 
    293   // Accessors for platform variant kernel load specifications.
    294   // Precondition: corresponding has_* is true.
    295   const CudaPtxOnDisk &cuda_ptx_on_disk() const {
    296     CHECK(has_cuda_ptx_on_disk());
    297     return *cuda_ptx_on_disk_;
    298   }
    299   const CudaCubinOnDisk &cuda_cubin_on_disk() const {
    300     CHECK(has_cuda_cubin_on_disk());
    301     return *cuda_cubin_on_disk_;
    302   }
    303   const CudaCubinInMemory &cuda_cubin_in_memory() const {
    304     CHECK(has_cuda_cubin_in_memory());
    305     return *cuda_cubin_in_memory_;
    306   }
    307   const CudaPtxInMemory &cuda_ptx_in_memory() const {
    308     CHECK(has_cuda_ptx_in_memory());
    309     return *cuda_ptx_in_memory_;
    310   }
    311   const OpenCLTextOnDisk &ocl_text_on_disk() const {
    312     CHECK(has_ocl_text_on_disk());
    313     return *ocl_text_on_disk_;
    314   }
    315   const OpenCLBinaryOnDisk &ocl_binary_on_disk() const {
    316     CHECK(has_ocl_binary_on_disk());
    317     return *ocl_binary_on_disk_;
    318   }
    319   const OpenCLTextInMemory &ocl_text_in_memory() const {
    320     CHECK(has_ocl_text_in_memory());
    321     return *ocl_text_in_memory_;
    322   }
    323 
    324   // Builder-pattern-like methods for use in initializing a
    325   // MultiKernelLoaderSpec. Each of these should be used at most once for a
    326   // single MultiKernelLoaderSpec object. See file comment for example usage.
    327   //
    328   // Note that the kernelname parameter must be consistent with the kernel in
    329   // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
    330   // name may be mangled by the compiler if it is not declared in an
    331   // extern "C" scope.
    332   MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename,
    333                                              port::StringPiece kernelname);
    334   MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename,
    335                                                port::StringPiece kernelname);
    336   MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text,
    337                                                port::StringPiece kernelname);
    338   MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename,
    339                                           port::StringPiece kernelname);
    340   MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename,
    341                                             port::StringPiece kernelname);
    342   MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes,
    343                                               port::StringPiece kernelname);
    344   MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx,
    345                                             port::StringPiece kernelname);
    346   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
    347       port::StringPiece ptx, port::StringPiece kernelname);
    348   MultiKernelLoaderSpec *AddCudaPtxInMemory(
    349       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
    350       port::StringPiece kernelname);
    351   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
    352       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
    353       port::StringPiece kernelname);
    354 
    355  private:
    356   std::unique_ptr<CudaPtxOnDisk>
    357       cuda_ptx_on_disk_;  // PTX text that resides in a file.
    358   std::unique_ptr<CudaCubinOnDisk>
    359       cuda_cubin_on_disk_;  // Binary CUDA program in a file.
    360   std::unique_ptr<CudaCubinInMemory>
    361       cuda_cubin_in_memory_;  // Binary CUDA program in memory.
    362   std::unique_ptr<CudaPtxInMemory>
    363       cuda_ptx_in_memory_;  // PTX text that resides in memory.
    364   std::unique_ptr<OpenCLTextOnDisk>
    365       ocl_text_on_disk_;  // OpenCL text that resides on disk.
    366   std::unique_ptr<OpenCLBinaryOnDisk>
    367       ocl_binary_on_disk_;  // OpenCL binary that resides on disk.
    368   std::unique_ptr<OpenCLTextInMemory>
    369       ocl_text_in_memory_;  // OpenCL text that resides in memory.
    370 
    371   // Number of parameters that the kernel takes. (This is nicer to have in a
    372   // constexpr than having to determine it from the types via template
    373   // metaprogramming).
    374   size_t arity_;
    375 };
    376 
    377 }  // namespace gputools
    378 }  // namespace perftools
    379 
    380 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
    381