Home | History | Annotate | Download | only in cuda
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // The CUDA implementation of the StreamExecutorInterface functionality.
     17 // CUDA inclusions are ideally confined to this implementation file.
     18 //
     19 // The notions from the StreamExecutor basically correspond to the CUDA streams
     20 // programming model provided by the libcuda.so driver APIs, so we don't have
     21 // to do much more than wrap the calls to the libraries appropriately.
     22 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
     23 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
     24 
     25 #include <set>
     26 #include <unordered_map>
     27 
     28 #include "tensorflow/stream_executor/cuda/cuda_kernel.h"
     29 #include "tensorflow/stream_executor/event.h"
     30 #include "tensorflow/stream_executor/lib/status.h"
     31 #include "tensorflow/stream_executor/lib/statusor.h"
     32 #include "tensorflow/stream_executor/platform.h"
     33 #include "tensorflow/stream_executor/platform/mutex.h"
     34 #include "tensorflow/stream_executor/platform/port.h"
     35 #include "tensorflow/stream_executor/platform/thread_annotations.h"
     36 #include "tensorflow/stream_executor/stream_executor_internal.h"
     37 
     38 namespace perftools {
     39 namespace gputools {
     40 namespace cuda {
     41 
     42 // CUDA-platform implementation of the platform-agnostic
     43 // StreamExecutorInterface.
     44 class CUDAExecutor : public internal::StreamExecutorInterface {
     45  public:
     46   // Stores plugin_config. The device handle, context, device ordinal and
     47   // compute capability start out zeroed/null (presumably filled in by Init()).
     48   explicit CUDAExecutor(const PluginConfig &plugin_config)
     49       : device_(0),
     50         context_(nullptr),
     51         device_ordinal_(0),
     52         cc_major_(0),
     53         cc_minor_(0),
     54         plugin_config_(plugin_config) {}
     55 
     56   // See the corresponding StreamExecutor methods for method comments on the
     57   // following overrides.
     58 
     59   ~CUDAExecutor() override;
     60 
     61   port::Status Init(int device_ordinal, DeviceOptions device_options) override;
     62 
     63   bool GetKernel(const MultiKernelLoaderSpec &spec,
     64                  KernelBase *kernel) override;
     65   void UnloadKernel(const KernelBase *kernel) override;
     66 
     67   bool Launch(Stream *stream, const ThreadDim &thread_dims,
     68               const BlockDim &block_dims, const KernelBase &k,
     69               const KernelArgsArrayBase &args) override;
     70 
     71   void *Allocate(uint64 size) override;
     72 
     73   void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
     74                           uint64 size_bytes) override;
     75 
     76   void Deallocate(DeviceMemoryBase *mem) override;
     77 
     78   // CUDA allocation/registration functions are necessary because the driver
     79   // internally sets up buffers for DMA operations (and page locks them).
     80   // There's no external interface for us to otherwise control these DMA
     81   // settings.
     82   void *HostMemoryAllocate(uint64 size) override {
     83     return CUDADriver::HostAllocate(context_, size);
     84   }
     85 
     86   void HostMemoryDeallocate(void *location) override {
     87     return CUDADriver::HostDeallocate(context_, location);
     88   }
     89 
     90   bool HostMemoryRegister(void *location, uint64 size) override;
     91 
     92   bool HostMemoryUnregister(void *location) override;
     93 
     94   bool SynchronizeAllActivity() override;
     95 
     96   bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
     97 
     98   bool SynchronousMemSet(DeviceMemoryBase *location, int value,
     99                          uint64 size) override;
    100 
    101   port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
    102                                  const void *host_src, uint64 size) override;
    103 
    104   port::Status SynchronousMemcpy(void *host_dst,
    105                                  const DeviceMemoryBase &gpu_src,
    106                                  uint64 size) override;
    107 
    108   port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
    109                                                const DeviceMemoryBase &gpu_src,
    110                                                uint64 size) override;
    111 
    112   bool MemZero(Stream *stream, DeviceMemoryBase *location,
    113                uint64 size) override;
    114   bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
    115               uint64 size) override;
    116   bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
    117                 uint64 size) override;
    118 
    119   bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
    120               uint64 size) override;
    121 
    122   bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
    123               uint64 size) override;
    124 
    125   bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
    126                             const DeviceMemoryBase &gpu_src,
    127                             uint64 size) override;
    128 
    129   bool HostCallback(Stream *stream, std::function<void()> callback) override;
    130 
    131   bool AllocateStream(Stream *stream) override;
    132 
    133   void DeallocateStream(Stream *stream) override;
    134 
    135   bool CreateStreamDependency(Stream *dependent, Stream *other) override;
    136 
    137   bool AllocateTimer(Timer *timer) override;
    138 
    139   void DeallocateTimer(Timer *timer) override;
    140 
    141   bool StartTimer(Stream *stream, Timer *timer) override;
    142 
    143   bool StopTimer(Stream *stream, Timer *timer) override;
    144 
    145   port::Status AllocateEvent(Event *event) override;
    146 
    147   port::Status DeallocateEvent(Event *event) override;
    148 
    149   port::Status RecordEvent(Stream *stream, Event *event) override;
    150 
    151   port::Status WaitForEvent(Stream *stream, Event *event) override;
    152 
    153   Event::Status PollForEventStatus(Event *event) override;
    154 
    155   port::Status BlockHostUntilDone(Stream *stream) override;
    156   // Reports the device count obtained from CUDADriver::GetDeviceCount().
    157   int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
    158 
    159   port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;
    160 
    161   bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;
    162 
    163   SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
    164 
    165   port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
    166 
    167   bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
    168 
    169   // Searches for the symbol and returns its device pointer and size through
    170   // mem and bytes. Returns false if the symbol does not exist.
    171   bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override;
    172 
    173   DeviceDescription *PopulateDeviceDescription() const override;
    174 
    175   // Populates the block_dim_limit by querying the device driver API. If an
    176   // error occurs at any point while asking the driver for block dim limits, it
    177   // will be only partially populated as a result, and an error will be logged.
    178   bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
    179 
    180   bool SupportsBlas() const override;
    181 
    182   blas::BlasSupport *CreateBlas() override;
    183 
    184   bool SupportsFft() const override;
    185 
    186   fft::FftSupport *CreateFft() override;
    187 
    188   bool SupportsRng() const override;
    189 
    190   rng::RngSupport *CreateRng() override;
    191 
    192   bool SupportsDnn() const override;
    193 
    194   dnn::DnnSupport *CreateDnn() override;
    195 
    196   std::unique_ptr<internal::EventInterface> CreateEventImplementation()
    197       override;
    198 
    199   std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
    200       override;
    201 
    202   std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
    203 
    204   std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
    205   // NOTE(review): presumably hands out context_ as a void*; confirm in the .cc.
    206   void *CudaContextHack() override;
    207   // NOTE(review): presumably an accessor for context_; confirm in the .cc.
    208   CudaContext* cuda_context();
    209 
    210  private:
    211   // Attempts to find a more specific version of the file indicated by
    212   // filename by looking for compute-capability-specific suffixed versions; i.e.
    213   // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
    214   // we're on a compute capability 3.0 machine.
    215   bool FindOnDiskForComputeCapability(port::StringPiece filename,
    216                                       port::StringPiece canonical_suffix,
    217                                       string *found_filename) const;
    218 
    219   // Host callback landing routine invoked by CUDA.
    220   // data: User-provided callback provided to HostCallback() above, captured
    221   //       as a std::function<void()>. Allocated/initialized inside
    222   //       HostCallback() and owned and deleted by this call.
    223   static void InternalHostCallback(CUstream stream, CUresult status,
    224                                    void *data);
    225 
    226   // Collects metadata for the specified kernel.
    227   bool GetKernelMetadata(CUDAKernel *cuda_kernel,
    228                          KernelMetadata *kernel_metadata);
    229 
    230   // Prints to VLOG(2) information about the kernel's occupancy and how it might
    231   // be improved.
    232   void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
    233                          const BlockDim &block_dims);
    234 
    235   // Guards the in-memory-module mapping.
    236   mutex in_memory_modules_mu_;
    237 
    238   // Kernel -> loaded GPU binary. Many kernels may load the same binary.
    239   std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
    240       GUARDED_BY(in_memory_modules_mu_);
    241   // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
    242   std::unordered_map<const void *, std::pair<CUmodule, uint64>>
    243       gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
    244 
    245   // Guards the launched kernel set.
    246   mutex launched_kernels_mu_;
    247 
    248   // Keeps track of the set of launched kernels. Currently used to suppress the
    249   // occupancy check on subsequent launches.
    250   std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
    251 
    252   // Handle for the CUDA device being operated on. Immutable
    253   // post-initialization.
    254   CUdevice device_;
    255 
    256   // Handle for session with the library/driver. Immutable post-initialization.
    257   CudaContext* context_;
    258 
    259   // The device ordinal value that this executor was initialized with; recorded
    260   // for use in getting device metadata. Immutable post-initialization.
    261   int device_ordinal_;
    262 
    263   // The major version of the compute capability for device_.
    264   int cc_major_;
    265 
    266   // The minor version of the compute capability for device_.
    267   int cc_minor_;
    268 
    269   // The plugin configuration associated with this instance.
    270   PluginConfig plugin_config_;
    271   // NOTE(review): defined elsewhere; judging by the name it forbids copying.
    272   SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
    273 };
    274 
    275 }  // namespace cuda
    276 }  // namespace gputools
    277 }  // namespace perftools
    278 
    279 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
    280