/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Kernel-loader specs are structures that describe how to load a data-parallel
// kernel on a given platform for subsequent launching. Headers that instantiate
// these data structures will typically be auto-generated. However, users can
// also instantiate them by hand.
//
// A kernel with the same exact functionality and type signature may be
// implemented on several different platforms. Typical usage is to create a
// singleton that describes how to load a kernel on the various supported
// platforms:
//
//  static const MultiKernelLoaderSpec &SaxpySpec() {
//    static auto *mkls =
//        (new MultiKernelLoaderSpec{4 /* = arity */})
//            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
//            ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
//
//    return *mkls;
//  }
//
// This lazily instantiates an object that describes how to load CUDA PTX
// present on disk that implements saxpy for the CUDA platform, or OpenCL text
// present on disk that implements saxpy for an OpenCL-based platform. The
// CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
// subsequent launching on a single platform.
42 // 43 // For the loader functionality that accepts these KernelLoaderSpecs in order 44 // to grab the kernel appropriately, see StreamExecutor::GetKernel(). 45 46 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ 47 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ 48 49 #include <stddef.h> 50 #include <map> 51 #include <memory> 52 #include "tensorflow/stream_executor/platform/port.h" 53 54 #include "tensorflow/stream_executor/lib/stringpiece.h" 55 #include "tensorflow/stream_executor/platform/logging.h" 56 #include "tensorflow/stream_executor/platform/mutex.h" 57 #include "tensorflow/stream_executor/platform/port.h" 58 59 namespace perftools { 60 namespace gputools { 61 62 // Describes how to load a kernel on a target platform. 63 // 64 // This is an abstract base class, subclassed for specific platforms. 65 // The filename_or_text field represents the program location (i.e. PTX or 66 // OpenCL loadable translation unit path) and is simply stored; whether it is a 67 // filename or text is exposed via more specifically named accessors in 68 // subclasses. 69 // 70 // These kernel loader specifications are typically auto-generated into header 71 // files at build time, but can also be specified manually. 72 class KernelLoaderSpec { 73 public: 74 virtual ~KernelLoaderSpec() {} 75 76 // Returns the kernel name to load out of the program. 77 const string &kernelname() const { return kernelname_; } 78 79 protected: 80 explicit KernelLoaderSpec(port::StringPiece kernelname); 81 82 private: 83 // The kernel name that should be loaded out of the program description given 84 // above. 85 string kernelname_; 86 87 SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec); 88 }; 89 90 // An abstract kernel loader spec that has an associated file path, where 91 // there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose 92 // canonical filename suffix is ".ptx". 
93 class OnDiskKernelLoaderSpec : public KernelLoaderSpec { 94 public: 95 ~OnDiskKernelLoaderSpec() override {} 96 97 // Returns the path to the on-disk loadable kernel file. 98 const string &filename() const { return filename_; } 99 100 // Returns the canonical suffix for this on-disk kernel loader spec format; 101 // e.g. PTX files on disk have a canonical suffix of ".ptx". 102 virtual const char *CanonicalSuffix() const = 0; 103 104 protected: 105 OnDiskKernelLoaderSpec(port::StringPiece filename, 106 port::StringPiece kernelname); 107 108 string filename_; 109 110 private: 111 SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec); 112 }; 113 114 // Kernel loader specification for PTX text that resides on disk. 115 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec { 116 public: 117 CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname); 118 ~CudaPtxOnDisk() override {} 119 120 const char *CanonicalSuffix() const override { return ".ptx"; } 121 122 private: 123 SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk); 124 }; 125 126 // Kernel loader specification for CUBIN binary that resides on disk. 127 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec { 128 public: 129 CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname); 130 ~CudaCubinOnDisk() override {} 131 132 const string &filename() const { return filename_; } 133 134 const char *CanonicalSuffix() const override { return ".cubin"; } 135 136 private: 137 string filename_; 138 139 SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk); 140 }; 141 142 // Kernel loader specification for PTX text that resides in memory. 143 class CudaPtxInMemory : public KernelLoaderSpec { 144 public: 145 // Components: compute capability major number, compute capability minor 146 // number, and PTX source. 147 typedef std::tuple<int, int, port::StringPiece> PtxSpec; 148 149 // Single-PTX constructor. Adds the provided PTX version with an unknown 150 // compute capability. 
Since the CC is unknown, the PTX is assumed to be very 151 // generally usable - in other words, PTX specified in this manner is VERY 152 // likely to be used as the default! Note that the PTX can be compressed, 153 // which is indicated by the argument ptx_compressed. 154 // 155 // Warning: the string backing the provided port::StringPiece ptx must outlive this 156 // instance. 157 CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname, 158 bool ptx_compressed = false); 159 160 // Multiple-PTX-version constructor. Adds each item in spec_list to this 161 // object. Note that the PTX can be compressed, which is indicated by the 162 // argument ptx_compressed. 163 CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list, 164 port::StringPiece kernel_name, bool ptx_compressed = false); 165 ~CudaPtxInMemory() override {} 166 167 // Add the PTX implementation described by ptx_spec to this object. On 168 // collision (i.e., if a version with the same compute_capability already 169 // exists), the existing implementation will be overwritten. 170 void AddSpec(PtxSpec ptx_spec); 171 172 // Returns pointer to the ptx of available implementation with the 173 // lowest-valued compute capability. For example, if PTX written to CC2.0, 174 // 3.0, and 3.5 are all available, the version for CC2.0 will be set. Returns 175 // nullptr on failed lookup (if any version is not available). 176 // When the ptx is compressed, returns the decompressed ptx. 177 const char *default_text() const; 178 179 // Similar to default_text(). 180 // When the ptx is compressed, returns the decompressed ptx. 181 const char *original_default_text() const; 182 183 // Returns pointer to the ptx for the requested compute capability. 184 // Returns nullptr on failed lookup (if the requested version is not 185 // available). 186 // When the ptx is compressed, returns the decompressed ptx. 
187 const char *text(int compute_capability_major, 188 int compute_capability_minor) const; 189 190 // Similar to text(). 191 // When the ptx is compressed, returns the original compressed ptx. 192 const char *original_text(int compute_capability_major, 193 int compute_capability_minor) const; 194 195 // Decompresses the PTX string using bzip2. 196 static string DecompressPtx(const char *ptx); 197 198 private: 199 // PTX translation unit text contents in memory. The key is of as a tuple 200 // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". Because CC's 201 // represented in this way have a clear sorting order, map::begin() will give 202 // the lowest-numbered version available, i.e. the default. 203 std::map<std::tuple<int, int>, const char *, 204 bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)> 205 ptx_by_compute_capability_; 206 207 // Stores all decompressed ptx strings, with original ptx string as keys. 208 // It is marked as mutable for lazy decompression. 209 mutable std::map<const char *, string> decompressed_ptx_; 210 mutable mutex mu_; 211 212 // Defines the minimum compute capability possible. Used when PTX has no 213 // compute capability specified (in the single-PTX constructor). 214 static const std::tuple<int, int> kMinimumCapability; 215 216 SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory); 217 }; 218 219 // Kernel loader specification for OpenCL text that resides on disk. 220 class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec { 221 public: 222 OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname); 223 ~OpenCLTextOnDisk() override {} 224 225 const char *CanonicalSuffix() const override { return ".ocl"; } 226 227 private: 228 SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk); 229 }; 230 231 // Kernel loader specification for OpenCL binary that resides on disk. 
232 class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec { 233 public: 234 OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname); 235 ~OpenCLBinaryOnDisk() override {} 236 237 const char *CanonicalSuffix() const override { return ".aocx"; } 238 239 private: 240 SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk); 241 }; 242 243 // Kernel loader specification for OpenCL text that resides in memory. 244 class OpenCLTextInMemory : public KernelLoaderSpec { 245 public: 246 OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname); 247 ~OpenCLTextInMemory() override {} 248 249 // Returns the OpenCL text contents. 250 const string &text() const { return text_; } 251 252 private: 253 // OpenCL translation unit text contents in memory. 254 string text_; 255 256 SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory); 257 }; 258 259 // Kernel loader specification for a CUBIN blob that resides in memory. 260 class CudaCubinInMemory : public KernelLoaderSpec { 261 public: 262 CudaCubinInMemory(const char *bytes, port::StringPiece kernelname); 263 ~CudaCubinInMemory() override {} 264 265 const char *bytes() const { return bytes_; } 266 267 private: 268 const char *bytes_; 269 270 SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory); 271 }; 272 273 // Describes how to load a kernel on any subset of a number of target platforms. 274 class MultiKernelLoaderSpec { 275 public: 276 explicit MultiKernelLoaderSpec(size_t arity); 277 278 // Returns the number of arguments that this kernel accepts. 279 size_t arity() const { return arity_; } 280 281 // Convenience getters for testing whether these platform variants have 282 // kernel loader specifications available. 
283 bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; } 284 bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; } 285 bool has_cuda_cubin_in_memory() const { 286 return cuda_cubin_in_memory_ != nullptr; 287 } 288 bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; } 289 bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; } 290 bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; } 291 bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; } 292 293 // Accessors for platform variant kernel load specifications. 294 // Precondition: corresponding has_* is true. 295 const CudaPtxOnDisk &cuda_ptx_on_disk() const { 296 CHECK(has_cuda_ptx_on_disk()); 297 return *cuda_ptx_on_disk_; 298 } 299 const CudaCubinOnDisk &cuda_cubin_on_disk() const { 300 CHECK(has_cuda_cubin_on_disk()); 301 return *cuda_cubin_on_disk_; 302 } 303 const CudaCubinInMemory &cuda_cubin_in_memory() const { 304 CHECK(has_cuda_cubin_in_memory()); 305 return *cuda_cubin_in_memory_; 306 } 307 const CudaPtxInMemory &cuda_ptx_in_memory() const { 308 CHECK(has_cuda_ptx_in_memory()); 309 return *cuda_ptx_in_memory_; 310 } 311 const OpenCLTextOnDisk &ocl_text_on_disk() const { 312 CHECK(has_ocl_text_on_disk()); 313 return *ocl_text_on_disk_; 314 } 315 const OpenCLBinaryOnDisk &ocl_binary_on_disk() const { 316 CHECK(has_ocl_binary_on_disk()); 317 return *ocl_binary_on_disk_; 318 } 319 const OpenCLTextInMemory &ocl_text_in_memory() const { 320 CHECK(has_ocl_text_in_memory()); 321 return *ocl_text_in_memory_; 322 } 323 324 // Builder-pattern-like methods for use in initializing a 325 // MultiKernelLoaderSpec. Each of these should be used at most once for a 326 // single MultiKernelLoaderSpec object. See file comment for example usage. 327 // 328 // Note that the kernelname parameter must be consistent with the kernel in 329 // the PTX or OpenCL being loaded. 
Also be aware that in CUDA C++ the kernel 330 // name may be mangled by the compiler if it is not declared in an 331 // extern "C" scope. 332 MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename, 333 port::StringPiece kernelname); 334 MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename, 335 port::StringPiece kernelname); 336 MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text, 337 port::StringPiece kernelname); 338 MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename, 339 port::StringPiece kernelname); 340 MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename, 341 port::StringPiece kernelname); 342 MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes, 343 port::StringPiece kernelname); 344 MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx, 345 port::StringPiece kernelname); 346 MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( 347 port::StringPiece ptx, port::StringPiece kernelname); 348 MultiKernelLoaderSpec *AddCudaPtxInMemory( 349 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, 350 port::StringPiece kernelname); 351 MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( 352 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, 353 port::StringPiece kernelname); 354 355 private: 356 std::unique_ptr<CudaPtxOnDisk> 357 cuda_ptx_on_disk_; // PTX text that resides in a file. 358 std::unique_ptr<CudaCubinOnDisk> 359 cuda_cubin_on_disk_; // Binary CUDA program in a file. 360 std::unique_ptr<CudaCubinInMemory> 361 cuda_cubin_in_memory_; // Binary CUDA program in memory. 362 std::unique_ptr<CudaPtxInMemory> 363 cuda_ptx_in_memory_; // PTX text that resides in memory. 364 std::unique_ptr<OpenCLTextOnDisk> 365 ocl_text_on_disk_; // OpenCL text that resides on disk. 366 std::unique_ptr<OpenCLBinaryOnDisk> 367 ocl_binary_on_disk_; // OpenCL binary that resides on disk. 
368 std::unique_ptr<OpenCLTextInMemory> 369 ocl_text_in_memory_; // OpenCL text that resides in memory. 370 371 // Number of parameters that the kernel takes. (This is nicer to have in a 372 // constexpr than having to determine it from the types via template 373 // metaprogramming). 374 size_t arity_; 375 }; 376 377 } // namespace gputools 378 } // namespace perftools 379 380 #endif // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ 381