/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_

#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "cuda/include/cuda.h"

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif

namespace perftools {
namespace gputools {
namespace cuda {

// Wraps a CUfunction to implement the platform-independent KernelInterface.
class CUDAKernel : public internal::KernelInterface {
 public:
  CUDAKernel()
      : cuda_function_(nullptr),
        arity_(0),
        preferred_cache_config_(KernelCacheConfig::kNoPreference) {}

  // Note that the function is unloaded when the module is unloaded, and the
  // module that the function is contained in is owned by the CUDAExecutor.
  ~CUDAKernel() override {}

  // As arity cannot be reflected upon using the CUDA API, the arity is
  // explicitly set during the CUDAExecutor::GetKernel initialization process.
  void set_arity(unsigned arity) { arity_ = arity; }
  unsigned Arity() const override { return arity_; }

  // Returns the CUfunction value for passing to the CUDA API.
  CUfunction AsCUDAFunctionValue() const {
    DCHECK(cuda_function_ != nullptr);
    return const_cast<CUfunction>(cuda_function_);
  }

  // Returns the slot that the CUfunction is stored within for this object,
  // for the CUDA API which wants to load into a CUfunction*.
  CUfunction *cuda_function_ptr() { return &cuda_function_; }

  // CUDA supports setting the preferred cache configuration of a CUfunction
  // (more-or-less equivalent to a CUDAKernel). We support this via the below
  // functions; users can set a preference, and that is applied when the kernel
  // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
  // load the kernel & set the preference when the user calls the setter below;
  // either approach is valid.
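  //
  // A minimal sketch of what that lazy application could look like on the
  // driver side (illustrative only; this header forbids raw driver calls, and
  // in the real code path the call below is wrapped in cuda_driver.cc):
  //
  //   CUfunc_cache cache = kernel.GetCUDACacheConfig();
  //   if (cache != CU_FUNC_CACHE_PREFER_NONE) {
  //     // cuFuncSetCacheConfig is the raw libcuda entry point assumed here.
  //     cuFuncSetCacheConfig(kernel.AsCUDAFunctionValue(), cache);
  //   }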
  // Sets the current kernel cache configuration preference.
  void SetPreferredCacheConfig(KernelCacheConfig config) override {
    preferred_cache_config_ = config;
  }

  // Returns the current kernel cache configuration preference.
  KernelCacheConfig GetPreferredCacheConfig() const override {
    return preferred_cache_config_;
  }

  // Returns the current kernel cache configuration preference as a
  // CUfunc_cache.
  CUfunc_cache GetCUDACacheConfig() const {
    switch (preferred_cache_config_) {
      case KernelCacheConfig::kNoPreference:
        return CU_FUNC_CACHE_PREFER_NONE;
      case KernelCacheConfig::kPreferShared:
        return CU_FUNC_CACHE_PREFER_SHARED;
      case KernelCacheConfig::kPreferL1:
        return CU_FUNC_CACHE_PREFER_L1;
      case KernelCacheConfig::kPreferEqual:
        return CU_FUNC_CACHE_PREFER_EQUAL;
      default:
        LOG(FATAL) << "Unknown KernelCacheConfig: "
                   << static_cast<int32>(preferred_cache_config_);
    }
  }

 private:
  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
  unsigned arity_;            // Number of formal parameters the kernel takes.

  // Preferred (but not required) cache configuration for this kernel.
  KernelCacheConfig preferred_cache_config_;
};

// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
  return static_cast<const CUDAKernel *>(kernel->implementation());
}

// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
  return static_cast<CUDAKernel *>(kernel->implementation());
}

}  // namespace cuda
}  // namespace gputools
}  // namespace perftools

#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
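
// Example usage (an illustrative sketch, not part of this header): how an
// executor-side caller might populate a CUDAKernel via the raw driver API,
// assuming a CUmodule has already been loaded from PTX/cubin. In the real
// code path these driver calls are wrapped by cuda_driver.cc, and the
// kernel name below is hypothetical.
//
//   perftools::gputools::cuda::CUDAKernel kernel;
//   // Resolve the kernel symbol within the loaded module, storing the
//   // resulting handle directly into the CUDAKernel's CUfunction slot.
//   cuModuleGetFunction(kernel.cuda_function_ptr(), module, "my_kernel");
//   kernel.set_arity(3);  // Must match the kernel's formal parameter count.
//   // Apply the preferred cache configuration before launching.
//   cuFuncSetCacheConfig(kernel.AsCUDAFunctionValue(),
//                        kernel.GetCUDACacheConfig());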