/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Declares BaseGPUDevice, the common base implementation of a CUDA GPU
// device, and BaseGPUDeviceFactory, which enumerates visible GPUs and
// creates one device instance per usable GPU.

#if !GOOGLE_CUDA
#error This file must only be included when building with Cuda support
#endif

#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"

namespace tensorflow {

// Base implementation of a GPU-backed Device.  Concrete subclasses are
// produced by BaseGPUDeviceFactory::CreateGPUDevice (see below).  The
// device does not own its allocators or its StreamExecutor.
class BaseGPUDevice : public LocalDevice {
 public:
  // Constructs the device but does not touch the GPU; call Init() to
  // acquire streams and per-device state before use.
  //
  //   options:              session configuration for this device.
  //   name:                 fully-qualified TF device name.
  //   memory_limit:         GPU memory budget for this device.
  //   locality:             placement/interconnect description.
  //   tf_gpu_id:            TF-assigned id ("/device:GPU:<n>"); mapped to a
  //                         CUDA ordinal via GpuIdManager (see gpu_id()).
  //   physical_device_desc: human-readable hardware description.
  //   gpu_allocator:        allocator for GPU memory (not owned).
  //   cpu_allocator:        allocator for host memory (not owned).
  //   sync_every_op:        if true, synchronize after each op (stored in
  //                         sync_every_op_; presumably a debugging aid --
  //                         behavior defined in the .cc file).
  //   max_streams:          upper bound on stream groups (stored in
  //                         max_streams_).
  BaseGPUDevice(const SessionOptions& options, const string& name,
                Bytes memory_limit, const DeviceLocality& locality,
                TfGpuId tf_gpu_id, const string& physical_device_desc,
                Allocator* gpu_allocator, Allocator* cpu_allocator,
                bool sync_every_op, int32 max_streams);

  ~BaseGPUDevice() override;

  // Initialize the device and return the status of initialization.
  Status Init(const SessionOptions& options);

  // GPU devices require the Op Compute method to save a reference to
  // any temporary tensors that are allocated until the Op execution
  // completes.
  bool RequiresRecordingAccessedTensors() const override;

  // Releases the tensor references recorded for an op execution (see
  // RequiresRecordingAccessedTensors above).
  void ConsumeListOfAccessedTensors(
      DeviceContext* device_context,
      const TensorReferenceVector& tensor_refs) override;

  // Assigns a DeviceContext to each node of 'graph' in
  // *device_context_map.  NOTE(review): not marked 'override' -- confirm
  // against the base class whether this is intended to override.
  Status FillContextMap(const Graph* graph,
                        DeviceContextMap* device_context_map);

  // Runs 'op_kernel' synchronously on this device.
  void Compute(OpKernel* op_kernel, OpKernelContext* context) override;

  // Blocks until the device has completed all preceding work.
  Status Sync() override;

  // Runs 'op_kernel' asynchronously; 'done' is invoked on completion.
  void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                    AsyncOpKernel::DoneCallback done) override;

  // Parses 'tensor_proto' into *tensor, placing it according to
  // 'alloc_attrs' (host vs. device memory).
  Status MakeTensorFromProto(const TensorProto& tensor_proto,
                             const AllocatorAttributes alloc_attrs,
                             Tensor* tensor) override;

  // The caller owns the returned device.
  PerOpGpuDevice* MakeGpuDevice() override;

  // Rebinds 'device' to the stream/allocator appropriate for the op
  // described by 'context' and 'dc'.
  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
                             DeviceContext* dc, Allocator* allocator) override;

  // Returns the CUDA GPU id of this device within the native driver system,
  // i.e. the CUDA ordinal of the GPU, obtained by mapping tf_gpu_id_
  // through GpuIdManager.
  int gpu_id() const { return GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value(); }

  // The executor that provides control for the device; e.g., for CUDA this
  // corresponds to the cuda context.
  gpu::StreamExecutor* executor() const { return executor_; }

 protected:
  Allocator* gpu_allocator_;  // not owned
  Allocator* cpu_allocator_;  // not owned

  gpu::StreamExecutor* executor_;  // not owned

 private:
  // One set of streams for a single logical execution lane.  Field names
  // indicate intended use (compute vs. the three copy directions); the
  // streams themselves are created elsewhere (see StreamGroupFactory).
  struct StreamGroup {
    gpu::Stream* compute = nullptr;
    gpu::Stream* host_to_device = nullptr;
    gpu::Stream* device_to_host = nullptr;
    gpu::Stream* device_to_device = nullptr;
  };
  class StreamGroupFactory;

  gtl::InlinedVector<StreamGroup*, 4> streams_;
  gtl::InlinedVector<char*, 4> scratch_;
  std::vector<GPUDeviceContext*> device_contexts_;
  GpuDeviceInfo* gpu_device_info_ = nullptr;
  mutex trace_mu_;
  TfGpuId tf_gpu_id_;
  const bool sync_every_op_ = false;
  const int32 max_streams_;
  std::unique_ptr<EventMgr> em_;
  std::unique_ptr<thread::ThreadPool> thread_pool_;

  // Points 'device' at the resources of stream group 'stream_id'.
  void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                          int stream_id, Allocator* allocator);

  // Shared implementation behind Compute().
  void ComputeHelper(OpKernel* op_kernel, OpKernelContext* context);

  // This method returns an initialization status, in addition to
  // calling the "done" StatusCallback, if there is a failure to
  // allocate memory or if the tensor "from" is not DMA-copyable.
  // If there is no error prior to enqueueing the copy, an OK status
  // is returned.
  Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs,
                              const Tensor& from, Tensor* to,
                              StatusCallback done);
};

// Factory that discovers visible CUDA GPUs and creates one BaseGPUDevice
// per valid GPU.  Subclasses supply the concrete device type via the pure
// virtual CreateGPUDevice overload.
class BaseGPUDeviceFactory : public DeviceFactory {
 public:
  // Creates and appends to *devices one device per valid visible GPU,
  // named "<name_prefix>/device:GPU:<n>".
  Status CreateDevices(const SessionOptions& options, const string& name_prefix,
                       std::vector<Device*>* devices) override;

  // Describes one class of direct GPU-to-GPU link (e.g. peer access).
  struct InterconnectMap {
    // Name of interconnect technology, if known.
    string name;
    // If possible, strength should approximate Gb/sec bandwidth rate.
    // Where architecture-specific subclassing is not done that won't
    // always be possible.  The minimum expectation is that
    // faster links should have a higher value than slower links.
    int32 strength;
    static const int kSameDeviceStrength;
    static const int kStreamExecutorStrength;
    // Ordered pairs (source, destination) of CUDA GPU ids connected by
    // this interconnect.
    std::set<std::pair<CudaGpuId, CudaGpuId>> directed_links;
  };

 protected:
  // Populates *maps with interconnect maps for all local direct access
  // pathways between GPUs.
  virtual Status GetInterconnectMaps(
      const std::vector<CudaGpuId>& visible_gpu_order,
      gpu::Platform* gpu_manager, std::vector<InterconnectMap>* maps);

  // Hash functor so TfGpuId (a wrapper around int) can key an
  // unordered_map.
  struct TfGpuIdHash {
    std::size_t operator()(const TfGpuId& id) const noexcept {
      return std::hash<int>{}(id.value());
    }
  };
  typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
  // Populates *localities with the DeviceLocality descriptor for
  // every TfGpuId.
  virtual Status GetDeviceLocalities(
      int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
      LocalityMap* localities);

 private:
  // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
  // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
  // vector.
  Status CreateGPUDevice(const SessionOptions& options,
                         const string& name_prefix, TfGpuId tf_gpu_id,
                         int64 memory_limit, const DeviceLocality& dev_locality,
                         std::vector<Device*>* devices);

  // Subclass hook: constructs the concrete device type.  Called by the
  // non-virtual CreateGPUDevice overload above.
  virtual BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
                                         const string& name, Bytes memory_limit,
                                         const DeviceLocality& dev_locality,
                                         TfGpuId tf_gpu_id,
                                         const string& physical_device_desc,
                                         Allocator* gpu_allocator,
                                         Allocator* cpu_allocator) = 0;

  // Returns into 'ids' the list of valid CUDA GPU ids, in the order that
  // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
  // based upon 'visible_gpu_order' which was generated by parsing
  // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU
  // ids.
  Status GetValidDeviceIds(const std::vector<CudaGpuId>& visible_gpu_order,
                           std::vector<CudaGpuId>* ids);

  // visible_gpu_initialized_[cuda_gpu_id] is true if visible GPU cuda_gpu_id
  // has been initialized by the process.
  std::unordered_map<int, bool> visible_gpu_initialized_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_