1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // The CUDA implementation of the StreamExecutorInterface functionality. 17 // CUDA inclusions are ideally confined to this implementation file. 18 // 19 // The notions from the StreamExecutor basically correspond to the CUDA streams 20 // programming model provided by the libcuda.so driver APIs, so we don't have 21 // to do much more than wrap the calls to the libraries appropriately. 22 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ 23 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ 24 25 #include <set> 26 #include <unordered_map> 27 28 #include "tensorflow/stream_executor/cuda/cuda_kernel.h" 29 #include "tensorflow/stream_executor/event.h" 30 #include "tensorflow/stream_executor/lib/status.h" 31 #include "tensorflow/stream_executor/lib/statusor.h" 32 #include "tensorflow/stream_executor/platform.h" 33 #include "tensorflow/stream_executor/platform/mutex.h" 34 #include "tensorflow/stream_executor/platform/port.h" 35 #include "tensorflow/stream_executor/platform/thread_annotations.h" 36 #include "tensorflow/stream_executor/stream_executor_internal.h" 37 38 namespace perftools { 39 namespace gputools { 40 namespace cuda { 41 42 // CUDA-platform implementation of the platform-agnostic 43 // StreamExecutorInferface. 44 class CUDAExecutor : public internal::StreamExecutorInterface { 45 public: 46 // sub_platform indicates the subplatform used in this executor; it must 47 // be a CUDA type. 48 explicit CUDAExecutor(const PluginConfig &plugin_config) 49 : device_(0), 50 context_(nullptr), 51 device_ordinal_(0), 52 cc_major_(0), 53 cc_minor_(0), 54 plugin_config_(plugin_config) {} 55 56 // See the corresponding StreamExecutor methods for method comments on the 57 // following overrides. 58 59 ~CUDAExecutor() override; 60 61 port::Status Init(int device_ordinal, DeviceOptions device_options) override; 62 63 bool GetKernel(const MultiKernelLoaderSpec &spec, 64 KernelBase *kernel) override; 65 void UnloadKernel(const KernelBase *kernel) override; 66 67 bool Launch(Stream *stream, const ThreadDim &thread_dims, 68 const BlockDim &block_dims, const KernelBase &k, 69 const KernelArgsArrayBase &args) override; 70 71 void *Allocate(uint64 size) override; 72 73 void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes, 74 uint64 size_bytes) override; 75 76 void Deallocate(DeviceMemoryBase *mem) override; 77 78 // CUDA allocation/registration functions are necessary because the driver 79 // internally sets up buffers for DMA operations (and page locks them). 80 // There's no external interface for us to otherwise control these DMA 81 // settings. 82 void *HostMemoryAllocate(uint64 size) override { 83 return CUDADriver::HostAllocate(context_, size); 84 } 85 86 void HostMemoryDeallocate(void *location) override { 87 return CUDADriver::HostDeallocate(context_, location); 88 } 89 90 bool HostMemoryRegister(void *location, uint64 size) override; 91 92 bool HostMemoryUnregister(void *location) override; 93 94 bool SynchronizeAllActivity() override; 95 96 bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override; 97 98 bool SynchronousMemSet(DeviceMemoryBase *location, int value, 99 uint64 size) override; 100 101 port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst, 102 const void *host_src, uint64 size) override; 103 104 port::Status SynchronousMemcpy(void *host_dst, 105 const DeviceMemoryBase &gpu_src, 106 uint64 size) override; 107 108 port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst, 109 const DeviceMemoryBase &gpu_src, 110 uint64 size) override; 111 112 bool MemZero(Stream *stream, DeviceMemoryBase *location, 113 uint64 size) override; 114 bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern, 115 uint64 size) override; 116 bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern, 117 uint64 size) override; 118 119 bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src, 120 uint64 size) override; 121 122 bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src, 123 uint64 size) override; 124 125 bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst, 126 const DeviceMemoryBase &gpu_src, 127 uint64 size) override; 128 129 bool HostCallback(Stream *stream, std::function<void()> callback) override; 130 131 bool AllocateStream(Stream *stream) override; 132 133 void DeallocateStream(Stream *stream) override; 134 135 bool CreateStreamDependency(Stream *dependent, Stream *other) override; 136 137 bool AllocateTimer(Timer *timer) override; 138 139 void DeallocateTimer(Timer *timer) override; 140 141 bool StartTimer(Stream *stream, Timer *timer) override; 142 143 bool StopTimer(Stream *stream, Timer *timer) override; 144 145 port::Status AllocateEvent(Event *event) override; 146 147 port::Status DeallocateEvent(Event *event) override; 148 149 port::Status RecordEvent(Stream *stream, Event *event) override; 150 151 port::Status WaitForEvent(Stream *stream, Event *event) override; 152 153 Event::Status PollForEventStatus(Event *event) override; 154 155 port::Status BlockHostUntilDone(Stream *stream) override; 156 157 int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); } 158 159 port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override; 160 161 bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override; 162 163 SharedMemoryConfig GetDeviceSharedMemoryConfig() override; 164 165 port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override; 166 167 bool DeviceMemoryUsage(int64 *free, int64 *total) const override; 168 169 // Search for the symbol and returns a device pointer and size. 170 // Returns false if symbol does not exist. 171 bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override; 172 173 DeviceDescription *PopulateDeviceDescription() const override; 174 175 // Populates the block_dim_limit by querying the device driver API. If an 176 // error occurs at any point while asking the driver for block dim limits, it 177 // will be only partially populated as a result, and an error will be logged. 178 bool FillBlockDimLimit(BlockDim *block_dim_limit) const; 179 180 bool SupportsBlas() const override; 181 182 blas::BlasSupport *CreateBlas() override; 183 184 bool SupportsFft() const override; 185 186 fft::FftSupport *CreateFft() override; 187 188 bool SupportsRng() const override; 189 190 rng::RngSupport *CreateRng() override; 191 192 bool SupportsDnn() const override; 193 194 dnn::DnnSupport *CreateDnn() override; 195 196 std::unique_ptr<internal::EventInterface> CreateEventImplementation() 197 override; 198 199 std::unique_ptr<internal::KernelInterface> CreateKernelImplementation() 200 override; 201 202 std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override; 203 204 std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override; 205 206 void *CudaContextHack() override; 207 208 CudaContext* cuda_context(); 209 210 private: 211 // Attempts to find a more specific version of the file indicated by 212 // filename by looking for compute-capability-specific suffixed versions; i.e. 213 // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if 214 // we're on a compute capability 3.0 machine. 215 bool FindOnDiskForComputeCapability(port::StringPiece filename, 216 port::StringPiece canonical_suffix, 217 string *found_filename) const; 218 219 // Host callback landing routine invoked by CUDA. 220 // data: User-provided callback provided to HostCallback() above, captured 221 // as a std::function<void()>. Allocated/initialized inside 222 // HostCallback() and owned and deleted by this call. 223 static void InternalHostCallback(CUstream stream, CUresult status, 224 void *data); 225 226 // Collects metadata for the specified kernel. 227 bool GetKernelMetadata(CUDAKernel *cuda_kernel, 228 KernelMetadata *kernel_metadata); 229 230 // Prints to VLOG(2) information about the kernel's occupancy and how it might 231 // be improved. 232 void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims, 233 const BlockDim &block_dims); 234 235 // Guards the in-memory-module mapping. 236 mutex in_memory_modules_mu_; 237 238 // Kernel -> loaded GPU binary. Many kernels may load the same binary. 239 std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_ 240 GUARDED_BY(in_memory_modules_mu_); 241 // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}. 242 std::unordered_map<const void *, std::pair<CUmodule, uint64>> 243 gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_); 244 245 // Guards the launched kernel set. 246 mutex launched_kernels_mu_; 247 248 // Keeps track of the set of launched kernels. Currently used to suppress the 249 // occupancy check on subsequent launches. 250 std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_); 251 252 // Handle for the CUDA device being operated on. Immutable 253 // post-initialization. 254 CUdevice device_; 255 256 // Handle for session with the library/driver. Immutable post-initialization. 257 CudaContext* context_; 258 259 // The device ordinal value that this executor was initialized with; recorded 260 // for use in getting device metadata. Immutable post-initialization. 261 int device_ordinal_; 262 263 // The major verion of the compute capability for device_. 264 int cc_major_; 265 266 // The minor verion of the compute capability for device_. 267 int cc_minor_; 268 269 // The plugin configuration associated with this instance. 270 PluginConfig plugin_config_; 271 272 SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor); 273 }; 274 275 } // namespace cuda 276 } // namespace gputools 277 } // namespace perftools 278 279 #endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_ 280