// tensorflow/stream_executor/cuda/cuda_kernel.h
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // The CUDA implementation of the StreamExecutorInterface functionality.
     17 // CUDA inclusions are ideally confined to this implementation file.
     18 //
     19 // The notions from the StreamExecutor basically correspond to the CUDA streams
     20 // programming model provided by the libcuda.so driver APIs, so we don't have
     21 // to do much more than wrap the calls to the libraries appropriately.
     22 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
     23 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
     24 
     25 #include "tensorflow/stream_executor/kernel_cache_config.h"
     26 #include "tensorflow/stream_executor/stream_executor_internal.h"
     27 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
     28 #include "tensorflow/stream_executor/lib/casts.h"
     29 #include "tensorflow/stream_executor/platform/port.h"
     30 #include "tensorflow/stream_executor/platform/logging.h"
     31 #include "cuda/include/cuda.h"
     32 
     33 #ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
     34 #error \
     35     "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
     36 #endif
     37 
     38 #ifdef __CUDA_RUNTIME_H__
     39 #error \
     40     "CUDA runtime being included into CUDA GPU executor; should be driver only."
     41 #endif
     42 
     43 namespace perftools {
     44 namespace gputools {
     45 namespace cuda {
     46 
     47 // Wraps a CUfunction to implement the platform-independent KernelInterface.
     48 class CUDAKernel : public internal::KernelInterface {
     49  public:
     50   CUDAKernel() : cuda_function_(nullptr), arity_(0),
     51                  preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
     52 
     53   // Note that the function is unloaded when the module is unloaded, and the
     54   // module that the function is contained in is owned by the CUDAExecutor.
     55   ~CUDAKernel() override {}
     56 
     57   // As arity cannot be reflected upon using the CUDA API, the arity is
     58   // explicitly set during the CUDAExecutor::GetKernel initialization process.
     59   void set_arity(unsigned arity) { arity_ = arity; }
     60   unsigned Arity() const override { return arity_; }
     61 
     62   // Returns the CUfunction value for passing to the CUDA API.
     63   CUfunction AsCUDAFunctionValue() const {
     64     DCHECK(cuda_function_ != nullptr);
     65     return const_cast<CUfunction>(cuda_function_);
     66   }
     67 
     68   // Returns the slot that the CUfunction is stored within for this object,
     69   // for the CUDA API which wants to load into a CUfunction*.
     70   CUfunction *cuda_function_ptr() { return &cuda_function_; }
     71 
     72   // CUDA supports setting the preferred cache configuration of a CUfunction
     73   // (more-or-less equivalent to a CUDAKernel). We support this via the below
     74   // functions; users can set a preference, and that is applied when the kernel
     75   // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
     76   // load the kernel & set the preference when the user calls the setter below;
     77   // either approach is valid.
     78   // Sets the current kernel cache configuration preference.
     79   void SetPreferredCacheConfig(KernelCacheConfig config) override {
     80     preferred_cache_config_ = config;
     81   }
     82 
     83   // Returns the current kernel cache configuration preference.
     84   KernelCacheConfig GetPreferredCacheConfig() const override {
     85     return preferred_cache_config_;
     86   }
     87 
     88   // Returns the current kernel cache configuration preference as a
     89   // CUfunc_cache.
     90   CUfunc_cache GetCUDACacheConfig() const {
     91     switch (preferred_cache_config_) {
     92       case KernelCacheConfig::kNoPreference:
     93         return CU_FUNC_CACHE_PREFER_NONE;
     94       case KernelCacheConfig::kPreferShared:
     95         return CU_FUNC_CACHE_PREFER_SHARED;
     96       case KernelCacheConfig::kPreferL1:
     97         return CU_FUNC_CACHE_PREFER_L1;
     98       case KernelCacheConfig::kPreferEqual:
     99         return CU_FUNC_CACHE_PREFER_EQUAL;
    100       default:
    101         LOG(FATAL) << "Unknown KernelCacheConfig"
    102                    << static_cast<int32>(preferred_cache_config_);
    103     }
    104   }
    105 
    106  private:
    107   CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
    108   unsigned arity_;            // Number of formal parameters the kernel takes.
    109 
    110   // Preferred (but not required) cache configuration for this kernel.
    111   KernelCacheConfig preferred_cache_config_;
    112 };
    113 
    114 // Given a platform-independent kernel datatype, returns the (const) internal
    115 // CUDA platform implementation pointer.
    116 inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
    117   return static_cast<const CUDAKernel *>(kernel->implementation());
    118 }
    119 
    120 // Given a platform-independent kernel datatype, returns the (non-const)
    121 // internal CUDA platform implementation pointer.
    122 inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
    123   return static_cast<CUDAKernel *>(kernel->implementation());
    124 }
    125 
    126 }  // namespace cuda
    127 }  // namespace gputools
    128 }  // namespace perftools
    129 
    130 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
    131