// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog (at) gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {

static const int kCudaScratchSize = 1024;

// This defines an interface that GPUDevice can take to use
// CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kCudaScratchSize (1k) bytes
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and
  // each kernel using it is responsible for resetting it to 0 upon
  // completion to maintain the invariant that the semaphore is always
  // equal to 0 upon each kernel start.
  virtual unsigned int* semaphore() const = 0;
};

static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    // Attempts to ensure proper behavior in the case of multiple threads
    // calling this function simultaneously. This would be trivial to
    // implement if we could use std::mutex, but unfortunately mutexes don't
    // compile with nvcc, so we resort to atomics and thread fences instead.
    // Note that if the caller uses a compiler that doesn't support C++11 we
    // can't ensure that the initialization is thread safe.
#if __cplusplus >= 201103L
    static std::atomic<bool> first(true);
    if (first.exchange(false)) {
#else
    static bool first = true;
    if (first) {
      first = false;
#endif
      // We're the first thread to reach this point.
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
        std::cerr << "Failed to get the number of CUDA devices: "
                  << cudaGetErrorString(status)
                  << std::endl;
        assert(status == cudaSuccess);
      }
      m_deviceProperties = new cudaDeviceProp[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
        if (status != cudaSuccess) {
          std::cerr << "Failed to initialize CUDA device #"
                    << i
                    << ": "
                    << cudaGetErrorString(status)
                    << std::endl;
          assert(status == cudaSuccess);
        }
      }

#if __cplusplus >= 201103L
      std::atomic_thread_fence(std::memory_order_release);
#endif
      m_devicePropInitialized = true;
    } else {
      // Wait for the other thread to initialize the properties.
      while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L
        std::atomic_thread_fence(std::memory_order_acquire);
#endif
        sleep(1);
      }
    }
  }
}
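
// As an illustration of the StreamInterface contract (a minimal sketch, not
// part of Eigen; the class and member names below are hypothetical and error
// handling is omitted): an implementation must hand out memory on the device
// the stream runs on, and its scratchpad must provide at least
// kCudaScratchSize + sizeof(unsigned int) bytes with the trailing word
// zero-initialized so that it can serve as the semaphore.
//
//   class PreallocatedStream : public Eigen::StreamInterface {
//    public:
//     PreallocatedStream(cudaStream_t stream, int device,
//                        void* scratch, unsigned int* semaphore)
//         : stream_(stream), device_(device),
//           scratch_(scratch), semaphore_(semaphore) {
//       initializeDeviceProp();  // populate m_deviceProperties
//     }
//     const cudaStream_t& stream() const { return stream_; }
//     const cudaDeviceProp& deviceProperties() const {
//       return m_deviceProperties[device_];
//     }
//     void* allocate(size_t num_bytes) const {
//       void* result = NULL;
//       cudaSetDevice(device_);
//       cudaMalloc(&result, num_bytes);
//       return result;
//     }
//     void deallocate(void* buffer) const {
//       cudaSetDevice(device_);
//       cudaFree(buffer);
//     }
//     // Caller-owned buffer of kCudaScratchSize + sizeof(unsigned int)
//     // bytes; the trailing word starts out equal to 0.
//     void* scratchpad() const { return scratch_; }
//     unsigned int* semaphore() const { return semaphore_; }
//    private:
//     cudaStream_t stream_;
//     int device_;
//     void* scratch_;
//     unsigned int* semaphore_;
//   };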

static const cudaStream_t default_stream = cudaStreamDefault;

class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device
  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    initializeDeviceProp();
  }
  // Use the specified stream. Note that it's the caller's responsibility
  // to ensure that the stream can run on the specified device. If no device
  // is specified, the code assumes that the stream is associated with the
  // current GPU device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }

  virtual ~CudaStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
    }
    return semaphore_;
  }

 private:
  const cudaStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
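
// Example usage of the three constructors (a sketch; error checks omitted):
//
//   Eigen::CudaStreamDevice dev0;           // default stream, current device
//   Eigen::CudaStreamDevice dev1(1);        // default stream on device 1
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::CudaStreamDevice dev2(&stream);  // caller-owned stream
//
// Note that the third constructor stores the cudaStream_t pointer rather
// than copying the stream, so the stream object must outlive the
// CudaStreamDevice, and the caller remains responsible for destroying it
// with cudaStreamDestroy.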

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API; we should not expose it.
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    if (err != cudaSuccess) {
      std::cerr << "Error detected in CUDA stream: "
                << cudaGetErrorString(err)
                << std::endl;
      assert(err == cudaSuccess);
    }
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the CUDA runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
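
// Example: evaluating a tensor expression through a GpuDevice (a sketch,
// assuming the unsupported/Eigen/CXX11/Tensor module is in use and that
// h_in/h_out are caller-provided host buffers of 128 floats; error checks
// omitted):
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//
//   std::size_t bytes = 128 * sizeof(float);
//   float* d_in  = static_cast<float*>(gpu_device.allocate(bytes));
//   float* d_out = static_cast<float*>(gpu_device.allocate(bytes));
//   gpu_device.memcpyHostToDevice(d_in, h_in, bytes);
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > in(d_in, 128);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, 128);
//   out.device(gpu_device) = in + in;  // launches asynchronously on the stream
//
//   gpu_device.memcpyDeviceToHost(h_out, d_out, bytes);
//   gpu_device.synchronize();  // wait for the kernel and the async copies
//   gpu_device.deallocate(d_in);
//   gpu_device.deallocate(d_out);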

#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)           \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  assert(cudaGetLastError() == cudaSuccess);


// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H