/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if !GOOGLE_CUDA
#error This file must only be included when building with CUDA support
#endif
     19 
     20 #ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
     21 #define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_
     22 
     23 #include <memory>
     24 #include <string>
     25 #include <unordered_map>
     26 #include <vector>
     27 
     28 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
     29 #include "tensorflow/core/common_runtime/device_factory.h"
     30 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
     31 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
     32 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
     33 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
     34 #include "tensorflow/core/common_runtime/gpu_device_context.h"
     35 #include "tensorflow/core/common_runtime/local_device.h"
     36 #include "tensorflow/core/framework/allocator.h"
     37 #include "tensorflow/core/framework/device_base.h"
     38 #include "tensorflow/core/framework/op_kernel.h"
     39 #include "tensorflow/core/framework/tensor.h"
     40 #include "tensorflow/core/lib/core/status.h"
     41 #include "tensorflow/core/platform/mutex.h"
     42 #include "tensorflow/core/platform/stream_executor.h"
     43 #include "tensorflow/core/platform/types.h"
     44 #include "tensorflow/core/public/session_options.h"
     45 
namespace tensorflow {

class BaseGPUDevice : public LocalDevice {
 public:
  BaseGPUDevice(const SessionOptions& options, const string& name,
                Bytes memory_limit, const DeviceLocality& locality,
                TfGpuId tf_gpu_id, const string& physical_device_desc,
                Allocator* gpu_allocator, Allocator* cpu_allocator,
                bool sync_every_op, int32 max_streams);

  ~BaseGPUDevice() override;

  // Initialize the device and return the status of initialization.
  Status Init(const SessionOptions& options);

  // GPU devices require an Op's Compute method to keep a reference to
  // any temporary tensors it allocates until the Op's execution
  // completes.
  bool RequiresRecordingAccessedTensors() const override;

  void ConsumeListOfAccessedTensors(
      DeviceContext* device_context,
      const TensorReferenceVector& tensor_refs) override;

  Status FillContextMap(const Graph* graph,
                        DeviceContextMap* device_context_map) override;

  void Compute(OpKernel* op_kernel, OpKernelContext* context) override;

  Status Sync() override;

  void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                    AsyncOpKernel::DoneCallback done) override;

  Status MakeTensorFromProto(const TensorProto& tensor_proto,
                             const AllocatorAttributes alloc_attrs,
                             Tensor* tensor) override;

  // The caller owns the returned device.
  PerOpGpuDevice* MakeGpuDevice() override;

  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
                             DeviceContext* dc, Allocator* allocator) override;

  // Returns the CUDA GPU id of this device within the native driver system,
  // i.e., the ordinal of the GPU within the system.
  int gpu_id() const { return GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value(); }

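  // For example (illustrative, not part of the original header): with
  // GPUOptions.visible_device_list = "2,0", the device "/device:GPU:0" has
  // tf_gpu_id_ == 0 and gpu_id() == 2, while "/device:GPU:1" has
  // tf_gpu_id_ == 1 and gpu_id() == 0.
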
  // The executor that provides control for the device; e.g., for CUDA this
  // corresponds to the CUDA context.
  gpu::StreamExecutor* executor() const { return executor_; }

 protected:
     98  protected:
     99   Allocator* gpu_allocator_;  // not owned
    100   Allocator* cpu_allocator_;  // not owned
    101 
    102   gpu::StreamExecutor* executor_;  // not owned
    103 
    104  private:
  // One set of streams per assigned stream id: a compute stream plus
  // dedicated host-to-device, device-to-host, and device-to-device copy
  // streams.
  struct StreamGroup {
    gpu::Stream* compute = nullptr;
    gpu::Stream* host_to_device = nullptr;
    gpu::Stream* device_to_host = nullptr;
    gpu::Stream* device_to_device = nullptr;
  };
  class StreamGroupFactory;

  gtl::InlinedVector<StreamGroup*, 4> streams_;
  gtl::InlinedVector<char*, 4> scratch_;
  std::vector<GPUDeviceContext*> device_contexts_;
  GpuDeviceInfo* gpu_device_info_ = nullptr;
  mutex trace_mu_;
  TfGpuId tf_gpu_id_;
  const bool sync_every_op_ = false;
  const int32 max_streams_;
  std::unique_ptr<EventMgr> em_;
  std::unique_ptr<thread::ThreadPool> thread_pool_;

  void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                          int stream_id, Allocator* allocator);

  void ComputeHelper(OpKernel* op_kernel, OpKernelContext* context);

  // Returns an error, in addition to invoking the "done" StatusCallback,
  // if memory allocation fails or if the tensor "from" is not DMA-copyable.
  // If no error occurs prior to enqueueing the copy, returns an OK status.
  Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs,
                              const Tensor& from, Tensor* to,
                              StatusCallback done);
};

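// A minimal concrete subclass, sketched loosely after the in-tree GPUDevice
// in gpu_device_factory.cc (illustrative only, not the verbatim class; the
// real subclass also overrides GetAllocator, and the constructor argument
// values shown for sync_every_op and max_streams are assumptions):
//
//   class GPUDevice : public BaseGPUDevice {
//    public:
//     GPUDevice(const SessionOptions& options, const string& name,
//               Bytes memory_limit, const DeviceLocality& locality,
//               TfGpuId tf_gpu_id, const string& physical_device_desc,
//               Allocator* gpu_allocator, Allocator* cpu_allocator)
//         : BaseGPUDevice(options, name, memory_limit, locality, tf_gpu_id,
//                         physical_device_desc, gpu_allocator, cpu_allocator,
//                         false /* sync_every_op */, 1 /* max_streams */) {}
//   };
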
class BaseGPUDeviceFactory : public DeviceFactory {
 public:
  Status CreateDevices(const SessionOptions& options, const string& name_prefix,
                       std::vector<Device*>* devices) override;

  struct InterconnectMap {
    // Name of interconnect technology, if known.
    string name;
    // If possible, strength should approximate the bandwidth in Gb/sec.
    // Where architecture-specific subclassing is not done, that won't
    // always be possible.  The minimum expectation is that faster links
    // have a higher value than slower links.
    int32 strength;
    static const int kSameDeviceStrength;
    static const int kStreamExecutorStrength;
    std::set<std::pair<CudaGpuId, CudaGpuId>> directed_links;
  };

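  // For example (illustrative, not part of the original header): for two
  // GPUs with mutual peer access, GetInterconnectMaps() below might yield a
  // single entry {name: "StreamExecutor", strength: kStreamExecutorStrength,
  // directed_links: {(0,1), (1,0)}}.
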
 protected:
  // Populates *maps with interconnect maps for all local direct access
  // pathways between GPUs.
  virtual Status GetInterconnectMaps(
      const std::vector<CudaGpuId>& visible_gpu_order,
      gpu::Platform* gpu_manager, std::vector<InterconnectMap>* maps);

  struct TfGpuIdHash {
    std::size_t operator()(const TfGpuId& id) const noexcept {
      return std::hash<int>{}(id.value());
    }
  };
  typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
  // Populates *localities with the DeviceLocality descriptor for
  // every TfGpuId.
  virtual Status GetDeviceLocalities(
      int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
      LocalityMap* localities);

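  // For example (illustrative, not part of the original header): an entry in
  // *localities might set DeviceLocality.bus_id from the GPU's NUMA affinity
  // and record each pathway from 'interconnects' as a link with the
  // corresponding strength.
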
 private:
  // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
  // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
  // vector.
  Status CreateGPUDevice(const SessionOptions& options,
                         const string& name_prefix, TfGpuId tf_gpu_id,
                         int64 memory_limit, const DeviceLocality& dev_locality,
                         std::vector<Device*>* devices);

  virtual BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
                                         const string& name, Bytes memory_limit,
                                         const DeviceLocality& dev_locality,
                                         TfGpuId tf_gpu_id,
                                         const string& physical_device_desc,
                                         Allocator* gpu_allocator,
                                         Allocator* cpu_allocator) = 0;

  // Returns into 'ids' the list of valid CUDA GPU ids, in the order that
  // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc.,
  // based upon 'visible_gpu_order', which was generated by parsing
  // GPUOptions::visible_device_list (a comma-separated list of CUDA GPU
  // ids).
  Status GetValidDeviceIds(const std::vector<CudaGpuId>& visible_gpu_order,
                           std::vector<CudaGpuId>* ids);

  // visible_gpu_initialized_[cuda_gpu_id] is true if visible GPU cuda_gpu_id
  // has been initialized by the process.
  std::unordered_map<int, bool> visible_gpu_initialized_;
};
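
// A concrete factory subclasses BaseGPUDeviceFactory and registers itself;
// a minimal sketch, loosely after the in-tree GPUDeviceFactory (illustrative
// only; the registration priority shown is an assumption):
//
//   class GPUDeviceFactory : public BaseGPUDeviceFactory {
//    private:
//     BaseGPUDevice* CreateGPUDevice(
//         const SessionOptions& options, const string& name,
//         Bytes memory_limit, const DeviceLocality& dev_locality,
//         TfGpuId tf_gpu_id, const string& physical_device_desc,
//         Allocator* gpu_allocator, Allocator* cpu_allocator) override {
//       return new GPUDevice(options, name, memory_limit, dev_locality,
//                            tf_gpu_id, physical_device_desc, gpu_allocator,
//                            cpu_allocator);
//     }
//   };
//   REGISTER_LOCAL_DEVICE_FACTORY("GPU", GPUDeviceFactory, 210);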

}  // namespace tensorflow

#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEVICE_H_