1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/stream_executor/kernel_spec.h" 17 18 19 namespace perftools { 20 namespace gputools { 21 22 KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname) 23 : kernelname_(kernelname.ToString()) {} 24 25 OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(port::StringPiece filename, 26 port::StringPiece kernelname) 27 : KernelLoaderSpec(kernelname), filename_(filename.ToString()) {} 28 29 CudaPtxOnDisk::CudaPtxOnDisk(port::StringPiece filename, 30 port::StringPiece kernelname) 31 : OnDiskKernelLoaderSpec(filename, kernelname) {} 32 33 CudaCubinOnDisk::CudaCubinOnDisk(port::StringPiece filename, 34 port::StringPiece kernelname) 35 : OnDiskKernelLoaderSpec(filename, kernelname) {} 36 37 CudaCubinInMemory::CudaCubinInMemory(const char *bytes, 38 port::StringPiece kernelname) 39 : KernelLoaderSpec(kernelname), bytes_(bytes) {} 40 41 bool CompareComputeCapability(const std::tuple<int, int> &lhs, 42 const std::tuple<int, int> &rhs) { 43 return std::get<0>(lhs) < std::get<0>(rhs) || 44 (std::get<0>(lhs) == std::get<0>(rhs) && 45 std::get<1>(lhs) < std::get<1>(rhs)); 46 } 47 48 const std::tuple<int, int> CudaPtxInMemory::kMinimumCapability{1, 0}; 49 50 CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx, 51 port::StringPiece kernel_name, 52 bool ptx_compressed) 53 : KernelLoaderSpec(kernel_name), 54 ptx_by_compute_capability_(CompareComputeCapability) { 55 if (ptx_compressed) { 56 // Lazy decompression. Put an empty string in decompressed_ptx_ showing that 57 // the original ptx is compressed. 58 decompressed_ptx_[ptx.data()] = ""; 59 } 60 ptx_by_compute_capability_[kMinimumCapability] = ptx.data(); 61 } 62 63 CudaPtxInMemory::CudaPtxInMemory( 64 const std::initializer_list<CudaPtxInMemory::PtxSpec> &spec_list, 65 port::StringPiece kernel_name, bool ptx_compressed) 66 : KernelLoaderSpec(kernel_name), 67 ptx_by_compute_capability_(CompareComputeCapability) { 68 for (const auto &spec : spec_list) { 69 int major, minor; 70 port::StringPiece ptx; 71 std::tie(major, minor, ptx) = spec; 72 if (ptx_compressed) { 73 // Lazy decompression. Put an empty string in decompressed_ptx_ showing 74 // that the original ptx is compressed. 75 decompressed_ptx_[ptx.data()] = ""; 76 } 77 ptx_by_compute_capability_[std::tuple<int, int>{major, minor}] = ptx.data(); 78 } 79 } 80 81 string CudaPtxInMemory::DecompressPtx(const char *ptx) { 82 // Get the length of the PTX string from the beginning of the buffer. 83 uint64 ptx_length = *reinterpret_cast<const uint64 *>(ptx); 84 // Get the PTX string from the buffer with offset and length. 85 string compressed_ptx(ptx + sizeof(uint64), 86 ptx + sizeof(uint64) + ptx_length); 87 string decompressed_ptx; 88 // Decompress the PTX string with bzip2. 89 LOG(FATAL) << "bzip2 decompression is not supported yet."; 90 return decompressed_ptx; 91 } 92 93 const char *CudaPtxInMemory::default_text() const { 94 if (ptx_by_compute_capability_.empty()) { 95 return nullptr; 96 } 97 98 mutex_lock lock{mu_}; 99 100 auto ptx = ptx_by_compute_capability_.begin()->second; 101 // Check if there is an entry in decompressed ptx table. 102 auto decompressed_ptx_iter = decompressed_ptx_.find(ptx); 103 if (decompressed_ptx_iter != decompressed_ptx_.end()) { 104 // If the decompressed string is empty, which means the ptx hasn't been 105 // decompressed, decompress it here. 106 if (decompressed_ptx_iter->second.empty()) { 107 decompressed_ptx_iter->second = DecompressPtx(ptx); 108 } 109 return decompressed_ptx_iter->second.c_str(); 110 } 111 return ptx; 112 } 113 114 const char *CudaPtxInMemory::original_default_text() const { 115 if (ptx_by_compute_capability_.empty()) { 116 return nullptr; 117 } 118 119 return ptx_by_compute_capability_.begin()->second; 120 } 121 122 const char *CudaPtxInMemory::text(int compute_capability_major, 123 int compute_capability_minor) const { 124 std::tuple<int, int> capability{compute_capability_major, 125 compute_capability_minor}; 126 127 auto ptx_iter = ptx_by_compute_capability_.find(capability); 128 if (ptx_iter == ptx_by_compute_capability_.end()) { 129 return nullptr; 130 } 131 132 mutex_lock lock{mu_}; 133 134 // Check if there is an entry in decompressed ptx table. 135 auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second); 136 if (decompressed_ptx_iter != decompressed_ptx_.end()) { 137 // If the decompressed string is empty, which means the ptx hasn't been 138 // decompressed, decompress it here. 139 if (decompressed_ptx_iter->second.empty()) { 140 decompressed_ptx_iter->second = DecompressPtx(ptx_iter->second); 141 } 142 return decompressed_ptx_iter->second.c_str(); 143 } 144 return ptx_iter->second; 145 } 146 147 const char *CudaPtxInMemory::original_text(int compute_capability_major, 148 int compute_capability_minor) const { 149 std::tuple<int, int> capability{compute_capability_major, 150 compute_capability_minor}; 151 152 auto ptx_iter = ptx_by_compute_capability_.find(capability); 153 if (ptx_iter == ptx_by_compute_capability_.end()) { 154 return nullptr; 155 } 156 157 return ptx_iter->second; 158 } 159 160 OpenCLTextOnDisk::OpenCLTextOnDisk(port::StringPiece filename, 161 port::StringPiece kernelname) 162 : OnDiskKernelLoaderSpec(filename, kernelname) {} 163 164 OpenCLTextInMemory::OpenCLTextInMemory(port::StringPiece text, 165 port::StringPiece kernelname) 166 : KernelLoaderSpec(kernelname), text_(text.ToString()) {} 167 168 OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(port::StringPiece filename, 169 port::StringPiece kernelname) 170 : OnDiskKernelLoaderSpec(filename, kernelname) {} 171 172 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk( 173 port::StringPiece filename, port::StringPiece kernelname) { 174 CHECK(ocl_text_on_disk_ == nullptr); 175 ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname}); 176 return this; 177 } 178 179 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk( 180 port::StringPiece filename, port::StringPiece kernelname) { 181 CHECK(ocl_binary_on_disk_ == nullptr); 182 ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname}); 183 return this; 184 } 185 186 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory( 187 port::StringPiece filename, port::StringPiece kernelname) { 188 CHECK(ocl_text_in_memory_ == nullptr); 189 ocl_text_in_memory_.reset(new OpenCLTextInMemory{filename, kernelname}); 190 return this; 191 } 192 193 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk( 194 port::StringPiece filename, port::StringPiece kernelname) { 195 CHECK(cuda_ptx_on_disk_ == nullptr); 196 cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname}); 197 return this; 198 } 199 200 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory( 201 const char *bytes, port::StringPiece kernelname) { 202 CHECK(cuda_cubin_in_memory_ == nullptr); 203 cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname}); 204 return this; 205 } 206 207 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk( 208 port::StringPiece filename, port::StringPiece kernelname) { 209 CHECK(cuda_cubin_on_disk_ == nullptr); 210 cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname}); 211 return this; 212 } 213 214 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory( 215 port::StringPiece ptx, port::StringPiece kernelname) { 216 CHECK(cuda_ptx_in_memory_ == nullptr); 217 cuda_ptx_in_memory_.reset( 218 new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */}); 219 return this; 220 } 221 222 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory( 223 port::StringPiece ptx, port::StringPiece kernelname) { 224 CHECK(cuda_ptx_in_memory_ == nullptr); 225 cuda_ptx_in_memory_.reset( 226 new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */}); 227 return this; 228 } 229 230 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory( 231 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, 232 port::StringPiece kernelname) { 233 CHECK(cuda_ptx_in_memory_ == nullptr); 234 cuda_ptx_in_memory_.reset( 235 new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */}); 236 return this; 237 } 238 239 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory( 240 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, 241 port::StringPiece kernelname) { 242 CHECK(cuda_ptx_in_memory_ == nullptr); 243 cuda_ptx_in_memory_.reset( 244 new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */}); 245 return this; 246 } 247 248 MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {} 249 250 } // namespace gputools 251 } // namespace perftools 252