1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/stream_executor/cuda/cuda_driver.h" 17 18 #include <stdint.h> 19 #include <stdlib.h> 20 #include <map> 21 #include <set> 22 #include <utility> 23 24 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" 25 #include "tensorflow/stream_executor/lib/casts.h" 26 #include "tensorflow/stream_executor/lib/env.h" 27 #include "tensorflow/stream_executor/lib/error.h" 28 #include "tensorflow/stream_executor/lib/human_readable.h" 29 #include "tensorflow/stream_executor/lib/notification.h" 30 #include "tensorflow/stream_executor/lib/threadpool.h" 31 #include "tensorflow/stream_executor/lib/stacktrace.h" 32 #include "tensorflow/stream_executor/lib/static_threadlocal.h" 33 #include "tensorflow/stream_executor/lib/strcat.h" 34 #include "tensorflow/stream_executor/lib/stringprintf.h" 35 #include "tensorflow/stream_executor/platform/logging.h" 36 #include "tensorflow/stream_executor/platform/mutex.h" 37 #include "tensorflow/stream_executor/platform/port.h" 38 #include "tensorflow/stream_executor/lib/inlined_vector.h" 39 40 #if defined(PLATFORM_WINDOWS) 41 // TODO: in windows ARRAYSIZE is defined in winnt.h but including it 42 // here creates a conflict with cuda.h - for now define it here. 
// Element count of a statically-sized array. The second division makes the
// expression ill-formed for (most) pointer arguments, so misuse fails to
// compile rather than silently yielding a wrong count.
#define ARRAYSIZE(a) \
  ((sizeof(a) / sizeof(*(a))) / \
   static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
#endif

// Test/debug knobs; expected to be set before driver calls are made.
// Forces InternalInit() to skip cuInit and report failure (fault injection).
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
// Makes ScopedActivateContext call cuCtxSynchronize on entry and exit, to
// localize asynchronous failures to the guilty driver call.
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
// NOTE(review): not referenced in this chunk — presumably restricts execution
// to device ordinal 0 elsewhere; confirm before relying on it.
bool FLAGS_gpuexec_cuda_device_0_only = false;

// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyCudaContext = false;

namespace perftools {
namespace gputools {
namespace cuda {

namespace {

// Manages the singleton map of contexts that we've created, mapping
// from the CUcontext to the CudaContext* that we pass around internally.
// This also manages assignment of unique ids to CudaContexts, to allow
// for fast comparison of a context against the current context.
//
// CUDA-runtime-created contexts are avoided, if triple angle
// brace launches are required, by using the scoped activations in
// cuda_activation.h.
//
// Thread-safe: all accessors take mu_.
class CreatedContexts {
 public:
  // Returns whether context is a member of the live set.
  static bool Has(CUcontext context) {
    tf_shared_lock lock{mu_};
    return Live()->find(context) != Live()->end();
  }

  // Adds context to the live set; returns the owning CudaContext wrapper,
  // which remains valid until Remove(context) is called.
  static CudaContext* Add(CUcontext context) {
    CHECK(context != nullptr);
    mutex_lock lock{mu_};
    auto cuda_context = new CudaContext(context, next_id_++);
    Live()->insert(
        std::make_pair(context, std::unique_ptr<CudaContext>(cuda_context)));
    return cuda_context;
  }

  // Removes context from the live set, destroying the CudaContext wrapper.
  // CHECK-fails if the context was never added.
  static void Remove(CUcontext context) {
    CHECK(context != nullptr);
    mutex_lock lock{mu_};
    auto it = Live()->find(context);
    CHECK(it != Live()->end()) << context;
    Live()->erase(it);
  }

 private:
  // Returns the live map singleton. Heap-allocated and intentionally leaked
  // so it is never destroyed during static teardown.
  static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
    static auto singleton =
        new std::map<CUcontext, std::unique_ptr<CudaContext>>;
    return singleton;
  }

  // Lock that guards access-to/mutation-of the live set.
  static mutex mu_;
  // Next unique id to assign; monotonically increasing under mu_.
  static int64 next_id_;
};

/* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
/* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"

// Formats CUresult to output prettified values into a log stream.
// Error summaries taken from:
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
//
// TODO(leary) switch to cuGetErrorName when updated cuda.h is available.
string ToString(CUresult result) {
#define OSTREAM_CUDA_ERROR(__name) \
  case CUDA_ERROR_##__name:        \
    return "CUDA_ERROR_" #__name;

  ///////////////
  // NOTE: here we specify return code values outside of the enum explicitly
  // because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
  // libraries are deployed in the fleet these error codes are backwards
  // compatible, but if we see a "new" one, we want to be able to identify it in
  // the logs.
  //
  // Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
  // eliminate this function and just rely on the driver to provide us these
  // strings.
  //
  // NOTE: "Must reboot all context" below is shorthand for, "must
  // destroy/recreate the offending context and any allocation which come from
  // it if you are to continue using CUDA."
  //
  // The pragma silences -Wswitch for the raw integer cases that are not
  // members of the (older) in-tree CUresult enum.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch"
  switch (result) {
    OSTREAM_CUDA_ERROR(INVALID_VALUE)
    OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
    OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
    OSTREAM_CUDA_ERROR(DEINITIALIZED)
    OSTREAM_CUDA_ERROR(NO_DEVICE)
    OSTREAM_CUDA_ERROR(INVALID_DEVICE)
    OSTREAM_CUDA_ERROR(INVALID_IMAGE)
    OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
    OSTREAM_CUDA_ERROR(INVALID_HANDLE)
    OSTREAM_CUDA_ERROR(NOT_FOUND)
    OSTREAM_CUDA_ERROR(NOT_READY)
    OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)

    // Encountered an uncorrectable ECC error during execution.
    OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)

    // Load/store on an invalid address. Must reboot all context.
    case 700:
      return "CUDA_ERROR_ILLEGAL_ADDRESS";
    // Passed too many / wrong arguments, too many threads for register count.
    case 701:
      return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
    // Kernel took too long to execute.
    case 702:
      return "CUDA_ERROR_LAUNCH_TIMEOUT";
    // Kernel launch uses an incompatible texturing mode.
    case 703:
      return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
    // Trying to re-enable peer access that already has it enabled.
    case 704:
      return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
    // Trying to disable peer access that has not yet been enabled.
    case 705:
      return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
    // Primary context for the specified device has already been initialized.
    case 708:
      return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
    // Context current to calling thread has been destroyed or is a primary
    // context that has not yet been initialized.
    case 709:
      return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
    // Device-side assert triggered during kernel execution. Must reboot all
    // context.
    case 710:
      return "CUDA_ERROR_ASSERT";
    // Hardware resources to enable peer access have been exhausted.
    case 711:
      return "CUDA_ERROR_TOO_MANY_PEERS";
    // Memory range has already been registered.
    case 712:
      return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
    // Pointer does not correspond to any currently registered memory region.
    case 713:
      return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
    // Due to stack corruption or exceeding stack size limit. Must reboot all
    // context.
    case 714:
      return "CUDA_ERROR_HARDWARE_STACK_ERROR";
    case 715:
      return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
    // Load/store on an unaligned memory address. Must reboot all context.
    case 716:
      return "CUDA_ERROR_MISALIGNED_ADDRESS";
    // Device instruction with specific address space given address not
    // belonging to allowed address space. Must reboot all context.
    case 717:
      return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
    // Device program counter wrapped its address space. Must reboot all
    // context.
    case 718:
      return "CUDA_ERROR_INVALID_PC";
    // Exception on device while executing a kernel; e.g. deref invalid device
    // pointer, accessing OOB shared memory. Must reboot all context.
    case 719:
      return "CUDA_ERROR_LAUNCH_FAILED";

    OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
    OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
    OSTREAM_CUDA_ERROR(NOT_PERMITTED)
    OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
    OSTREAM_CUDA_ERROR(UNKNOWN)  // Unknown internal error to CUDA.
    default:
      // Unrecognized code: render the raw numeric value so it is still
      // identifiable in logs.
      return port::StrCat("CUresult(", static_cast<int>(result), ")");
  }
#pragma GCC diagnostic pop
}

// Returns the current context and checks that it is in the set of CUDA contexts
// created by StreamExecutor (to ensure that the CUDA runtime didn't create a
// context behind our backs).
// Returns the CUcontext current on this thread, LOG(FATAL)ing if it is
// non-null but was not created through CreatedContexts (i.e. the CUDA runtime
// created a context behind our backs).
CUcontext CurrentContext() {
  CUcontext current = CUDADriver::CurrentContextOrDie();
  if (current != nullptr && !CreatedContexts::Has(current)) {
    LOG(FATAL) << "current context was not created by the StreamExecutor "
                  "cuda_driver API: "
               << current
               << "; a CUDA runtime call "
                  "was likely performed without using a StreamExecutor context";
  }
  return current;
}

// CUDA driver routines may require a large amount of stack (particularly
// cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
// stack-limited threads (such as those spawned by a default-argument
// thread::ThreadPool on some platforms), we run certain routines in this pool
// and wait for completion.
static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
// Creates the single-threaded pool used for stack-heavy driver calls.
static port::ThreadPool *InitializeDriverExecutor() {
  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
                              "cuda_driver", 1);
}

// Returns the lazily-initialized driver-call thread pool singleton. The
// explicit mutex guards first-time construction of the function-local static.
port::ThreadPool *GetDriverExecutor() {
  mutex_lock lock(driver_executor_threadpool_mu);
  static port::ThreadPool *thread_pool = InitializeDriverExecutor();
  return thread_pool;
}

}  // namespace

// Returns a human-readable name for the given memory space; LOG(FATAL)s on an
// out-of-range enum value.
string MemorySpaceString(MemorySpace memory_space) {
  switch (memory_space) {
    case MemorySpace::kHost:
      return "host";
    case MemorySpace::kDevice:
      return "device";
    default:
      LOG(FATAL) << "impossible memory space";
  }
}

namespace {

// Call cuCtxtSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
  auto res = cuCtxSynchronize();
  if (res != CUDA_SUCCESS) {
    LOG(FATAL) << "Synchronize found "
               << ToString(res) << " :: " << port::CurrentStackTrace();
  }
}

// Per-thread record of which context this thread last made current, plus the
// nesting depth of ScopedActivateContext on this thread.
struct ThreadLocalData {
  int64 id;              // CreatedContexts id of the current context.
  CudaContext* context;  // Only valid if id == a known good context.
  int depth;             // Nesting depth of ScopedActivateContext.
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);

}  // namespace

// Makes cuda_context current on this thread for the scope's lifetime,
// recording (in to_restore_) what to switch back to on destruction. A no-op
// fast path applies when cuda_context is already current per thread-local
// bookkeeping.
ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();
  tls->depth++;
  if (tls->id == cuda_context->id()) {
    // Already current on this thread; nothing to restore later (to_restore_
    // stays in whatever state a prior activation left it — outer scopes own
    // the actual restore).
    if (kVerifyCudaContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  // At depth 1 there is no enclosing activation, so nothing to restore.
  to_restore_ = (tls->depth == 1 ? nullptr : tls->context);

  // Set the context and update thread local.
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}

// Restores the context captured at construction (if any) and unwinds the
// per-thread nesting depth.
ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyCudaContext) {
    // Note that if kVerifyCudaContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}

namespace {

// Returns a stringified device number associated with pointer, primarily for
// logging purposes. Returns "?" if the device could not be successfully
// queried.
347 string CUDAPointerToDeviceString(CUdeviceptr pointer) { 348 auto value = CUDADriver::GetPointerDevice(pointer); 349 if (value.ok()) { 350 return port::StrCat(value.ValueOrDie()); 351 } 352 LOG(ERROR) << "could not query device: " << value.status(); 353 return "?"; 354 } 355 356 // Returns a stringified memory space associated with pointer, primarily for 357 // logging purposes. Returns "?" if the memory space could not be successfully 358 // queried. 359 string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) { 360 auto value = CUDADriver::GetPointerMemorySpace(pointer); 361 if (value.ok()) { 362 return MemorySpaceString(value.ValueOrDie()); 363 } 364 LOG(ERROR) << "could not query device: " << value.status(); 365 return "?"; 366 } 367 368 // Returns a stringified representation of whether or not peer access is 369 // permitted between the "from" and "to" pointers' associated contexts, 370 // primarily for logging purposes. Returns "error" if an error is encountered 371 // in the process of querying. 372 string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) { 373 auto from_context = CUDADriver::GetPointerContext(from); 374 if (!from_context.ok()) { 375 LOG(ERROR) << "could not retrieve source pointer's context: " 376 << from_context.status(); 377 return "error"; 378 } 379 auto to_context = CUDADriver::GetPointerContext(to); 380 if (!to_context.ok()) { 381 LOG(ERROR) << "could not retrieve destination pointer's context: " 382 << to_context.status(); 383 return "error"; 384 } 385 return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(), 386 to_context.ValueOrDie()) 387 ? "true" 388 : "false"; 389 } 390 391 392 // Actually performs the work of CUDA initialization. Wrapped up in one-time 393 // execution guard. 
394 static port::Status InternalInit() { 395 CUresult res = CUDA_ERROR_NO_DEVICE; 396 if (FLAGS_gpuexec_cuda_driver_inject_init_error) { 397 LOG(ERROR) << "injecting CUDA init error; initialization will fail"; 398 } else { 399 res = cuInit(0 /* = flags */); 400 } 401 402 if (res == CUDA_SUCCESS) { 403 return port::Status::OK(); 404 } 405 406 LOG(ERROR) << "failed call to cuInit: " << ToString(res); 407 Diagnostician::LogDiagnosticInformation(); 408 return port::Status{port::error::ABORTED, 409 port::StrCat("failed call to cuInit: ", ToString(res))}; 410 } 411 412 } // namespace 413 414 /* static */ port::Status CUDADriver::Init() { 415 // Cached return value from calling InternalInit(), as cuInit need only be 416 // called once, but CUDADriver::Init may be called many times. 417 static port::Status init_retval; 418 static bool set = false; 419 static mutex *init_mu = new mutex; 420 421 mutex_lock lock(*init_mu); 422 if (!set) { 423 init_retval = InternalInit(); 424 set = true; 425 } 426 427 return init_retval; 428 } 429 430 /* static */ port::Status CUDADriver::GetDevice(int device_ordinal, 431 CUdevice *device) { 432 CUresult res = cuDeviceGet(device, device_ordinal); 433 if (res == CUDA_SUCCESS) { 434 return port::Status::OK(); 435 } 436 437 return port::Status{ 438 port::error::INTERNAL, 439 port::StrCat("failed call to cuDeviceGet: ", ToString(res))}; 440 } 441 442 /* static */ bool CUDADriver::GetDeviceName(CUdevice device, 443 string *device_name) { 444 static const size_t kCharLimit = 64; 445 port::InlinedVector<char, 4> chars(kCharLimit); 446 CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device); 447 if (res != CUDA_SUCCESS) { 448 LOG(ERROR) << "failed to get device name for " << device << ": " 449 << ToString(res); 450 return false; 451 } 452 chars[kCharLimit - 1] = '\0'; 453 *device_name = chars.begin(); 454 return true; 455 } 456 457 bool DeviceOptionsToContextFlags(const DeviceOptions &device_options, 458 int *flags) { 459 
static_assert(DeviceOptions::kMask == 0xf, 460 "needs update for new device options"); 461 462 if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) { 463 *flags |= CU_CTX_LMEM_RESIZE_TO_MAX; 464 } 465 466 // If no flags are set the default is CU_CTX_SCHED_AUTO, which 467 // in Google environments is very likely to mean SPIN. 468 if (device_options.flags() & DeviceOptions::kScheduleSpin) { 469 *flags |= CU_CTX_SCHED_SPIN; 470 } 471 if (device_options.flags() & DeviceOptions::kScheduleYield) { 472 *flags |= CU_CTX_SCHED_YIELD; 473 } 474 if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) { 475 *flags |= CU_CTX_SCHED_BLOCKING_SYNC; 476 } 477 478 return true; 479 } 480 481 /* static */ port::Status CUDADriver::CreateContext( 482 CUdevice device, DeviceOptions device_options, CudaContext** context) { 483 *context = nullptr; 484 485 int flags = 0; 486 if (!DeviceOptionsToContextFlags(device_options, &flags)) { 487 LOG(WARNING) << "could not convert all device options into context flags"; 488 } 489 490 CUresult res; 491 CUcontext former_context; 492 CUcontext new_context; 493 { 494 // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their 495 // context creation: see http://b/13248943 496 497 #if CUDA_VERSION >= 7000 498 { 499 unsigned int former_primary_context_flags; 500 int former_primary_context_is_active; 501 CHECK_EQ(CUDA_SUCCESS, 502 cuDevicePrimaryCtxGetState(device, &former_primary_context_flags, 503 &former_primary_context_is_active)); 504 if (former_primary_context_flags != flags) { 505 if (former_primary_context_is_active) { 506 LOG(ERROR) 507 << "The primary context is active and has a different flag set (" 508 << former_primary_context_flags << ") than the desired flag set (" 509 << flags << ")."; 510 } else { 511 CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags)); 512 } 513 } 514 } 515 516 former_context = CUDADriver::CurrentContextOrDie(); 517 res = cuDevicePrimaryCtxRetain(&new_context, 
device); 518 if (former_context != nullptr) { 519 CUdevice former_device; 520 if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) { 521 if (former_device == device) { 522 if (former_context == new_context) { 523 VLOG(2) << "The primary context " << former_context 524 << " for device " << device 525 << " exists before initializing the StreamExecutor."; 526 } else { 527 LOG(WARNING) 528 << "A non-primary context " << former_context << " for device " 529 << device 530 << " exists before initializing the StreamExecutor. The " 531 << "primary context is now " << new_context << ". We " 532 << "haven't verified StreamExecutor works with that."; 533 } 534 } 535 } else { 536 LOG(ERROR) << "Failed to get the device of the current context " 537 << former_context; 538 } 539 } 540 #else 541 former_context = CurrentContext(); 542 if (former_context != nullptr) { 543 LOG(WARNING) 544 << "creating context when one is currently active; existing: " 545 << former_context; 546 } 547 res = cuCtxCreate(&new_context, flags, device); 548 #endif 549 } 550 CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context)); 551 552 if (res == CUDA_SUCCESS) { 553 *context = CreatedContexts::Add(new_context); 554 CHECK(*context != nullptr) 555 << "success in this call must entail non-null result"; 556 VLOG(2) << "created context " << context << " for this thread"; 557 return port::Status::OK(); 558 } 559 560 #if CUDA_VERSION >= 7000 561 string message = "failed call to cuDevicePrimaryCtxRetain: " + ToString(res); 562 #else 563 string message = "failed call to cuCtxCreate: " + ToString(res); 564 #endif 565 if (res == CUDA_ERROR_OUT_OF_MEMORY) { 566 uint64 total_memory; 567 if (GetDeviceTotalMemory(device, &total_memory)) { 568 port::StrAppend(&message, "; total memory reported: ", total_memory); 569 } else { 570 port::StrAppend(&message, "; could not query total memory"); 571 } 572 } 573 574 return port::Status{port::error::INTERNAL, message}; 575 } 576 577 /* static */ void 
CUDADriver::DestroyContext(CudaContext* context) { 578 if (context == nullptr) { 579 return; 580 } 581 #if CUDA_VERSION >= 7000 582 CUcontext former_context = CurrentContext(); 583 CUresult res = cuCtxSetCurrent(context->context()); 584 CUdevice device; 585 cuCtxGetDevice(&device); 586 cuCtxSetCurrent(former_context); 587 588 res = cuDevicePrimaryCtxRelease(device); 589 #else 590 CUresult res = cuCtxDestroy(context->context()); 591 #endif 592 593 if (res != CUDA_SUCCESS) { 594 LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res); 595 } 596 597 CreatedContexts::Remove(context->context()); 598 } 599 600 /* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute, 601 CUfunction func, 602 int *attribute_value) { 603 CUresult res = cuFuncGetAttribute(attribute_value, attribute, func); 604 if (res != CUDA_SUCCESS) { 605 LOG(ERROR) << "failed to query kernel attribute. kernel: " << func 606 << ", attribute: " << attribute; 607 return false; 608 } 609 return true; 610 } 611 612 /* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function, 613 CUfunc_cache cache_config) { 614 CUresult res = cuFuncSetCacheConfig(function, cache_config); 615 if (res != CUDA_SUCCESS) { 616 LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function 617 << ", config: " << cache_config << ", result: " << ToString(res); 618 return false; 619 } 620 621 return true; 622 } 623 624 /* static */ port::StatusOr<CUsharedconfig> 625 CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { 626 CUsharedconfig shared_mem_config; 627 ScopedActivateContext activation{context}; 628 CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config); 629 if (result != CUDA_SUCCESS) { 630 CUdevice device; 631 cuCtxGetDevice(&device); 632 LOG(ERROR) << "failed to get CUDA device shared memory config. 
" 633 << "Context device ID: " << device 634 << ", result: " << ToString(result); 635 return port::Status{ 636 port::error::INTERNAL, 637 port::StrCat("failed to get shared memory config: ", ToString(result))}; 638 } 639 return shared_mem_config; 640 } 641 642 /* static */ port::Status CUDADriver::ContextSetSharedMemConfig( 643 CudaContext* context, CUsharedconfig shared_mem_config) { 644 ScopedActivateContext activation{context}; 645 CUresult result = cuCtxSetSharedMemConfig(shared_mem_config); 646 if (result != CUDA_SUCCESS) { 647 CUdevice device; 648 cuCtxGetDevice(&device); 649 LOG(ERROR) << "failed to set CUDA device shared memory config. " 650 << "Context device ID: " << device 651 << ", config: " << shared_mem_config 652 << ", result: " << ToString(result); 653 return port::Status{ 654 port::error::INTERNAL, 655 port::StrCat("failed to set shared memory config: ", ToString(result))}; 656 } 657 return port::Status::OK(); 658 } 659 660 /* static */ bool CUDADriver::LaunchKernel( 661 CudaContext* context, CUfunction function, unsigned int grid_dim_x, 662 unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x, 663 unsigned int block_dim_y, unsigned int block_dim_z, 664 unsigned int shared_mem_bytes, CUstream stream, void **kernel_params, 665 void **extra) { 666 ScopedActivateContext activation{context}; 667 VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x 668 << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z 669 << " bdx: " << block_dim_x << " bdy: " << block_dim_y 670 << " bdz: " << block_dim_z; 671 CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z, 672 block_dim_x, block_dim_y, block_dim_z, 673 shared_mem_bytes, stream, kernel_params, extra); 674 if (res != CUDA_SUCCESS) { 675 LOG(ERROR) << "failed to launch CUDA kernel: " << function 676 << "; result: " << ToString(res); 677 return false; 678 } 679 VLOG(2) << "successfully launched kernel"; 680 return true; 681 } 682 683 /* static */ 
port::Status CUDADriver::LoadCubin(CudaContext* context, 684 const char *cubin_bytes, 685 CUmodule *module) { 686 ScopedActivateContext activation{context}; 687 CUresult result = cuModuleLoadFatBinary(module, cubin_bytes); 688 if (result != CUDA_SUCCESS) { 689 return port::Status{port::error::INTERNAL, 690 "failed to load in-memory CUBIN: " + ToString(result)}; 691 } 692 693 return port::Status::OK(); 694 } 695 696 /* static */ bool CUDADriver::LoadPtx(CudaContext* context, 697 const char *ptx_contents, 698 CUmodule *module) { 699 port::Notification notification; 700 bool ret = true; 701 GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret, 702 ¬ification]() { 703 ScopedActivateContext activation{context}; 704 void *ptx_data = const_cast<char *>(ptx_contents); 705 static const unsigned int kLogBufferBytesLimit = 1024; 706 unsigned int error_log_buffer_bytes = kLogBufferBytesLimit; 707 unsigned int info_log_buffer_bytes = kLogBufferBytesLimit; 708 port::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes); 709 port::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes); 710 bool log_verbose = true; 711 CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, 712 CU_JIT_ERROR_LOG_BUFFER, 713 CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, 714 CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE}; 715 // Note that the driver API wants the contents of this values to be stored 716 // in an array of void*s, so we coerce them accordingly. 
717 void *option_values[] = { 718 port::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)), 719 port::bit_cast<void *>(error_log_buffer.data()), 720 port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)), 721 port::bit_cast<void *>(info_log_buffer.data()), 722 port::bit_cast<void *>(uintptr_t(log_verbose))}; 723 CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values)); 724 725 CUresult res; 726 { 727 // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their 728 // module loading: see http://b/13248943 729 730 res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options, 731 option_values); 732 } 733 734 // The PTX JIT mutates the values in the option values array to reflect the 735 // size of the logs it output; now that we've made the call, read the values 736 // back out. 737 error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]); 738 info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]); 739 CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit); 740 CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit); 741 742 if (res != CUDA_SUCCESS) { 743 LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res); 744 // As a precaution for null termination of the API-provided value, ensure 745 // that at least the last byte is null. 746 error_log_buffer[error_log_buffer_bytes ? 
747 error_log_buffer_bytes - 1 : 0] = '\0'; 748 LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes 749 << " bytes): " << error_log_buffer.data(); 750 ret = false; 751 notification.Notify(); 752 } 753 754 VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes 755 << " bytes): " << info_log_buffer.data(); 756 VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes 757 << " bytes): " << error_log_buffer.data(); 758 CHECK(module != nullptr); 759 notification.Notify(); 760 }); 761 notification.WaitForNotification(); 762 763 return ret; 764 } 765 766 /* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context, 767 CUdeviceptr location, 768 uint8 value, size_t size) { 769 ScopedActivateContext activation{context}; 770 CUresult res = cuMemsetD8(location, value, size); 771 if (res != CUDA_SUCCESS) { 772 LOG(ERROR) << "failed to memset memory: " << ToString(res); 773 return false; 774 } 775 return true; 776 } 777 778 /* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context, 779 CUdeviceptr location, 780 uint32 value, 781 size_t uint32_count) { 782 ScopedActivateContext activation{context}; 783 CUresult res = cuMemsetD32(location, value, uint32_count); 784 if (res != CUDA_SUCCESS) { 785 LOG(ERROR) << "failed to memset memory: " << ToString(res); 786 return false; 787 } 788 return true; 789 } 790 791 /* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context, 792 CUdeviceptr location, 793 uint8 value, 794 size_t uint32_count, 795 CUstream stream) { 796 ScopedActivateContext activation{context}; 797 CUresult res = cuMemsetD8Async(location, value, uint32_count, stream); 798 if (res != CUDA_SUCCESS) { 799 LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); 800 return false; 801 } 802 VLOG(2) << "successfully enqueued async memset operation"; 803 return true; 804 } 805 806 /* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context, 807 CUdeviceptr location, 808 
                                                       uint32 value,
                                                       size_t uint32_count,
                                                       CUstream stream) {
  // Enqueues a 32-bit-word memset of `uint32_count` words on `stream`.
  ScopedActivateContext activation{context};
  CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
    return false;
  }
  VLOG(2) << "successfully enqueued async memset operation";
  return true;
}

// Registers `callback` to run on the host once prior work on `stream`
// completes; returns false and logs on failure. No context activation is
// performed (the stream identifies its context).
/* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
                                                CUstream stream,
                                                StreamCallback callback,
                                                void *data) {
  // Note: flags param is required to be zero according to CUDA 6.0.
  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "unable to add host callback: " << ToString(res);
    return false;
  }
  return true;
}

// Looks up kernel `kernel_name` in a loaded module; returns false and logs on
// failure.
/* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
                                                CUmodule module,
                                                const char *kernel_name,
                                                CUfunction *function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  CUresult res = cuModuleGetFunction(function, module, kernel_name);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

// Looks up global `symbol_name` in a loaded module, writing its device address
// and/or size to the non-null out-params. Failure is only VLOGged, since the
// symbol may legitimately live in a different module.
/* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
                                              CUmodule module,
                                              const char *symbol_name,
                                              CUdeviceptr *dptr,
                                              size_t *bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != CUDA_SUCCESS) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

// Unloads `module` from `context`; failure is logged and the module leaks.
/* static */ void CUDADriver::UnloadModule(CudaContext *context,
                                           CUmodule module) {
  ScopedActivateContext activated{context};
  CUresult res = cuModuleUnload(module);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to unload module " << module
               << "; leaking: " << ToString(res);
  }
}

// Returns the CUdevice backing `context`, or an INTERNAL status on failure.
/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
    CudaContext* context) {
  ScopedActivateContext activated{context};
  CUdevice device = -1;
  CUresult result = cuCtxGetDevice(&device);
  if (result == CUDA_SUCCESS) {
    return device;
  }

  return port::Status{
      port::error::INTERNAL,
      port::StrCat("failed to get device for context: ", ToString(result))};
}

// Creates a new stream in `context`, storing it in *out; returns false and
// logs on failure.
/* static */ bool CUDADriver::CreateStream(CudaContext *context,
                                           CUstream *out) {
  // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
  // up synchronization with respect to memsets and any other things that have
  // to occur on the default stream?
  ScopedActivateContext activated{context};
  CUresult res = cuStreamCreate(out, 0);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "could not allocate CUDA stream for context " << context
               << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *out << " for context "
          << context << " on thread";
  return true;
}

// Destroys *stream (no-op if already null), nulling it out on success so the
// handle cannot be reused.
/* static */ void CUDADriver::DestroyStream(CudaContext* context,
                                            CUstream *stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  CUresult res = cuStreamDestroy(*stream);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to destroy CUDA stream for context " << context
               << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for context "
            << context;
    *stream = nullptr;
  }
}

// Allocates `bytes` of device memory in `context`; returns nullptr (and logs)
// on failure.
/* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
                                              uint64 bytes) {
  ScopedActivateContext activated{context};
  CUdeviceptr result = 0;
  CUresult res = cuMemAlloc(&result, bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to allocate "
               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
               << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void *ptr = reinterpret_cast<void *>(result);
  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
          << bytes << " bytes";
  return ptr;
}

// Frees device memory previously returned by DeviceAllocate; failure is
// logged, not surfaced.
/* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
                                               void *location) {
  ScopedActivateContext activation{context};
  CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
  CUresult res = cuMemFree(pointer);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to free device memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated " << location << " for context " << context;
  }
}

// Allocates page-locked host memory visible to all contexts; returns nullptr
// (and logs) on failure.
/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
                                            uint64 bytes) {
  ScopedActivateContext activation{context};
  void *host_mem = nullptr;
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes on host: " << ToString(res);
  }
  return host_mem;
}

// Frees memory previously returned by HostAllocate; failure is logged, not
// surfaced.
/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
                                             void *location) {
  ScopedActivateContext activation{context};
  CUresult res = cuMemFreeHost(location);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error deallocating host memory at " << location << ": "
               << ToString(res);
  }
}

// Page-locks an existing host allocation of `bytes` at `location`; returns
// false and logs on failure.
/* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
                                           uint64 bytes) {
  ScopedActivateContext activation{context};
  // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
987 CUresult res = 988 cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE); 989 if (res != CUDA_SUCCESS) { 990 LOG(ERROR) << "error registering host memory at " << location << ": " 991 << ToString(res); 992 return false; 993 } 994 return true; 995 } 996 997 /* static */ bool CUDADriver::HostUnregister(CudaContext* context, 998 void *location) { 999 ScopedActivateContext activation{context}; 1000 CUresult res = cuMemHostUnregister(location); 1001 if (res != CUDA_SUCCESS) { 1002 LOG(ERROR) << "error unregistering host memory at " << location << ": " 1003 << ToString(res); 1004 return false; 1005 } 1006 return true; 1007 } 1008 1009 /* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context, 1010 CUevent *event) { 1011 if (*event == nullptr) { 1012 return port::Status{port::error::INVALID_ARGUMENT, 1013 "input event cannot be null"}; 1014 } 1015 1016 ScopedActivateContext activated{context}; 1017 CUresult res = cuEventDestroy(*event); 1018 *event = nullptr; 1019 1020 switch (res) { 1021 case CUDA_SUCCESS: 1022 return port::Status::OK(); 1023 case CUDA_ERROR_DEINITIALIZED: 1024 case CUDA_ERROR_NOT_INITIALIZED: 1025 return port::Status{ 1026 port::error::FAILED_PRECONDITION, 1027 port::Printf("error destroying CUDA event in context %p: %s", context, 1028 ToString(res).c_str())}; 1029 default: 1030 return port::Status{ 1031 port::error::INTERNAL, 1032 port::Printf("error destroying CUDA event in context %p: %s", context, 1033 ToString(res).c_str())}; 1034 } 1035 } 1036 1037 /* static */ port::Status CUDADriver::RecordEvent(CudaContext* context, 1038 CUevent event, 1039 CUstream stream) { 1040 ScopedActivateContext activated{context}; 1041 CUresult res = cuEventRecord(event, stream); 1042 switch (res) { 1043 case CUDA_SUCCESS: 1044 return port::Status::OK(); 1045 case CUDA_ERROR_DEINITIALIZED: 1046 case CUDA_ERROR_NOT_INITIALIZED: 1047 return port::Status{ 1048 port::error::FAILED_PRECONDITION, 1049 port::Printf("error recording CUDA event on 
stream %p: %s", stream, 1050 ToString(res).c_str())}; 1051 default: 1052 return port::Status{ 1053 port::error::INVALID_ARGUMENT, 1054 port::Printf("error recording CUDA event on stream %p: %s", stream, 1055 ToString(res).c_str())}; 1056 } 1057 } 1058 1059 /* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent( 1060 CudaContext *context, CUevent event) { 1061 ScopedActivateContext activated{context}; 1062 CUresult res = cuEventQuery(event); 1063 if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) { 1064 return port::Status{ 1065 port::error::INTERNAL, 1066 port::Printf("failed to query event: %s", ToString(res).c_str())}; 1067 } 1068 1069 return res; 1070 } 1071 1072 /* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context, 1073 float *elapsed_milliseconds, 1074 CUevent start, CUevent stop) { 1075 ScopedActivateContext activated{context}; 1076 // The stop event must have completed in order for cuEventElapsedTime to 1077 // work. 1078 CUresult res = cuEventSynchronize(stop); 1079 if (res != CUDA_SUCCESS) { 1080 LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res); 1081 return false; 1082 } 1083 res = cuEventElapsedTime(elapsed_milliseconds, start, stop); 1084 if (res != CUDA_SUCCESS) { 1085 LOG(ERROR) << "failed to get elapsed time between events: " 1086 << ToString(res); 1087 return false; 1088 } 1089 1090 return true; 1091 } 1092 1093 /* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context, 1094 CUstream stream, 1095 CUevent event) { 1096 ScopedActivateContext activation{context}; 1097 CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */); 1098 if (res != CUDA_SUCCESS) { 1099 LOG(ERROR) << "could not wait stream on event: " << ToString(res); 1100 return false; 1101 } 1102 1103 return true; 1104 } 1105 1106 /* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) { 1107 ScopedActivateContext activation{context}; 1108 CUresult res = cuCtxSynchronize(); 1109 if (res != CUDA_SUCCESS) { 
1110 LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res) 1111 << " :: " << port::CurrentStackTrace(); 1112 return false; 1113 } 1114 1115 return true; 1116 } 1117 1118 /* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context, 1119 CUstream stream) { 1120 ScopedActivateContext activated{context}; 1121 CHECK(stream != nullptr); 1122 CUresult res = cuStreamSynchronize(stream); 1123 if (res != CUDA_SUCCESS) { 1124 port::Status status = port::InternalError( 1125 port::StrCat("could not synchronize on CUDA stream: ", ToString(res))); 1126 LOG(ERROR) << status << " :: " << port::CurrentStackTrace(); 1127 return status; 1128 } 1129 VLOG(2) << "successfully synchronized stream " << stream << " on context " 1130 << context; 1131 return port::Status::OK(); 1132 } 1133 1134 /* static */ bool CUDADriver::IsStreamIdle(CudaContext *context, 1135 CUstream stream) { 1136 ScopedActivateContext activated{context}; 1137 CHECK(stream != nullptr); 1138 CUresult res = cuStreamQuery(stream); 1139 if (res == CUDA_SUCCESS) { 1140 return true; 1141 } 1142 1143 if (res != CUDA_ERROR_NOT_READY) { 1144 LOG(ERROR) << "stream in bad state on status query: " << ToString(res); 1145 } 1146 return false; 1147 } 1148 1149 /* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context, 1150 void *host_dst, 1151 CUdeviceptr gpu_src, 1152 uint64 size) { 1153 ScopedActivateContext activation{context}; 1154 CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size); 1155 if (res != CUDA_SUCCESS) { 1156 return port::InternalError( 1157 port::Printf("failed to synchronous memcpy from device to host: %s; " 1158 "host dst: %p; GPU src: %p; size: %llu=0x%llx", 1159 ToString(res).c_str(), host_dst, 1160 port::bit_cast<void *>(gpu_src), size, size)); 1161 } 1162 VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to " 1163 << host_dst; 1164 return port::Status::OK(); 1165 } 1166 1167 /* static */ port::Status 
CUDADriver::SynchronousMemcpyH2D(CudaContext *context, 1168 CUdeviceptr gpu_dst, 1169 const void *host_src, 1170 uint64 size) { 1171 ScopedActivateContext activation{context}; 1172 CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size); 1173 if (res != CUDA_SUCCESS) { 1174 return port::InternalError(port::Printf( 1175 "failed to synchronous memcpy from host to device: %s; GPU dst: %p;" 1176 " host src: %p; size: %llu=0x%llx", 1177 ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, 1178 size)); 1179 } 1180 VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes"; 1181 return port::Status::OK(); 1182 } 1183 1184 /* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context, 1185 CUdeviceptr gpu_dst, 1186 CUdeviceptr gpu_src, 1187 uint64 size) { 1188 ScopedActivateContext activation{context}; 1189 CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size); 1190 if (res != CUDA_SUCCESS) { 1191 return port::InternalError(port::Printf( 1192 "failed to synchronous memcpy from host to device: %s; GPU dst: %p; " 1193 "GPU src: %p; size: %llu=0x%llx", 1194 ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), 1195 port::bit_cast<void *>(gpu_src), size, size)); 1196 } 1197 VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes"; 1198 return port::Status::OK(); 1199 } 1200 1201 /* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context, 1202 void *host_dst, 1203 CUdeviceptr gpu_src, 1204 uint64 size, 1205 CUstream stream) { 1206 ScopedActivateContext activation{context}; 1207 CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream); 1208 if (res != CUDA_SUCCESS) { 1209 LOG(ERROR) << port::Printf( 1210 "failed to enqueue async memcpy from device to host: %s; host dst: %p; " 1211 "GPU src: %p; size: %llu=0x%llx", 1212 ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size); 1213 return false; 1214 } 1215 VLOG(2) << "successfully enqueued async memcpy d2h of " << size 1216 
<< " bytes from " << port::bit_cast<void *>(gpu_src) << " to " << host_dst 1217 << " on stream " << stream; 1218 return true; 1219 } 1220 1221 /* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context, 1222 CUdeviceptr gpu_dst, 1223 const void *host_src, 1224 uint64 size, 1225 CUstream stream) { 1226 ScopedActivateContext activation{context}; 1227 CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream); 1228 if (res != CUDA_SUCCESS) { 1229 LOG(ERROR) << port::Printf( 1230 "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; " 1231 "host src: %p; size: %llu=0x%llx", 1232 ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size); 1233 return false; 1234 } 1235 VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes" 1236 << " on stream " << stream; 1237 return true; 1238 } 1239 1240 /* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context, 1241 CUdeviceptr gpu_dst, 1242 CUdeviceptr gpu_src, 1243 uint64 size, 1244 CUstream stream) { 1245 ScopedActivateContext activation{context}; 1246 CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream); 1247 if (result != CUDA_SUCCESS) { 1248 LOG(ERROR) << port::Printf( 1249 "failed to enqueue async memcpy from device to device: %s" 1250 "; GPU dst: %p on %s %s" 1251 "; GPU src: %p on %s %s" 1252 "; can access? 
%s; size: %llu=0x%llx", 1253 ToString(result).c_str(), port::bit_cast<void *>(gpu_dst), 1254 CUDAPointerToMemorySpaceString(gpu_dst).c_str(), 1255 CUDAPointerToDeviceString(gpu_dst).c_str(), port::bit_cast<void *>(gpu_src), 1256 CUDAPointerToMemorySpaceString(gpu_src).c_str(), 1257 CUDAPointerToDeviceString(gpu_src).c_str(), 1258 CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size); 1259 1260 return false; 1261 } 1262 VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes"; 1263 return true; 1264 } 1265 1266 /* static */ port::Status CUDADriver::CreateEvent(CudaContext* context, 1267 CUevent *result, 1268 EventFlags flags) { 1269 int cuflags; 1270 switch (flags) { 1271 case EventFlags::kDefault: 1272 cuflags = CU_EVENT_DEFAULT; 1273 break; 1274 case EventFlags::kDisableTiming: 1275 cuflags = CU_EVENT_DISABLE_TIMING; 1276 break; 1277 default: 1278 LOG(FATAL) << "impossible event flags: " << int(flags); 1279 } 1280 1281 ScopedActivateContext activated{context}; 1282 CUresult res = cuEventCreate(result, cuflags); 1283 1284 if (res == CUDA_SUCCESS) { 1285 return port::Status::OK(); 1286 } else if (res == CUDA_ERROR_OUT_OF_MEMORY) { 1287 return port::Status{port::error::RESOURCE_EXHAUSTED, 1288 "could not create CUDA event: out of device memory"}; 1289 } else { 1290 return port::Status{ 1291 port::error::FAILED_PRECONDITION, 1292 port::StrCat("could not create CUDA event: ", ToString(res))}; 1293 } 1294 } 1295 1296 /* static */ int CUDADriver::GetDeviceCount() { 1297 int device_count = 0; 1298 CUresult res = cuDeviceGetCount(&device_count); 1299 if (res != CUDA_SUCCESS) { 1300 LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res); 1301 return 0; 1302 } 1303 1304 if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) { 1305 device_count = 1; 1306 } 1307 return device_count; 1308 } 1309 1310 /* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext( 1311 CUdeviceptr pointer) { 1312 CudaContext* context 
= nullptr; 1313 CUresult result = 1314 cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer); 1315 if (result == CUDA_SUCCESS) { 1316 CHECK(context != nullptr) << "success should entail non-null context"; 1317 return context; 1318 } 1319 1320 return port::Status{ 1321 port::error::INTERNAL, 1322 port::StrCat("failed to query device pointer for context: ", 1323 ToString(result))}; 1324 } 1325 1326 /* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace( 1327 CUdeviceptr pointer) { 1328 unsigned int value; 1329 CUresult result = 1330 cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer); 1331 if (result == CUDA_SUCCESS) { 1332 switch (value) { 1333 case CU_MEMORYTYPE_DEVICE: 1334 return MemorySpace::kDevice; 1335 case CU_MEMORYTYPE_HOST: 1336 return MemorySpace::kHost; 1337 default: 1338 return port::Status{ 1339 port::error::INTERNAL, 1340 port::StrCat("unknown memory space provided by CUDA API: ", value)}; 1341 } 1342 } 1343 1344 return port::Status{ 1345 port::error::INTERNAL, 1346 port::StrCat("failed to query device pointer for memory space: ", 1347 ToString(result))}; 1348 } 1349 1350 /* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr, 1351 CUdeviceptr *base, 1352 size_t *size) { 1353 CUresult result = cuMemGetAddressRange(base, size, dptr); 1354 if (result == CUDA_SUCCESS) { 1355 return port::Status::OK(); 1356 } else if (result == CUDA_ERROR_NOT_FOUND) { 1357 // We differentiate between "this pointer is unknown" (return here) and 1358 // "there was an internal error while performing this operation" (return 1359 // below). 
1360 return port::Status{ 1361 port::error::NOT_FOUND, 1362 port::Printf("not a device pointer %p; %s", 1363 reinterpret_cast<void *>(dptr), ToString(result).c_str())}; 1364 } 1365 1366 return port::Status{ 1367 port::error::INTERNAL, 1368 port::Printf("failed to get pointer into for device pointer %p; %s", 1369 reinterpret_cast<void *>(dptr), ToString(result).c_str())}; 1370 } 1371 1372 /* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice( 1373 CUdeviceptr pointer) { 1374 auto result = GetPointerContext(pointer); 1375 if (!result.ok()) { 1376 return result.status(); 1377 } 1378 1379 return DeviceFromContext(result.ValueOrDie()); 1380 } 1381 1382 /* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major, 1383 int *cc_minor, 1384 CUdevice device) { 1385 *cc_major = 0; 1386 *cc_minor = 0; 1387 CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device); 1388 if (result == CUDA_SUCCESS) { 1389 return port::Status::OK(); 1390 } 1391 1392 return port::Status{ 1393 port::error::INTERNAL, 1394 port::Printf("failed to get compute capability for device: %s; %d", 1395 ToString(result).c_str(), device)}; 1396 } 1397 1398 // Helper function that turns the integer output of cuDeviceGetAttribute to type 1399 // T and wraps it in a StatusOr. 
1400 template <typename T> 1401 static port::StatusOr<T> GetSimpleAttribute(CUdevice device, 1402 CUdevice_attribute attribute) { 1403 int value = -1; 1404 CUresult result = cuDeviceGetAttribute(&value, attribute, device); 1405 if (result != CUDA_SUCCESS) { 1406 return port::Status{ 1407 port::error::NOT_FOUND, 1408 port::StrCat("could not retrieve CUDA device attribute (", attribute, 1409 "): ", ToString(result))}; 1410 } 1411 T converted = value; 1412 return converted; 1413 } 1414 1415 /* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount( 1416 CUdevice device) { 1417 return GetSimpleAttribute<int>(device, 1418 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); 1419 } 1420 1421 /* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore( 1422 CUdevice device) { 1423 return GetSimpleAttribute<int64>( 1424 device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); 1425 } 1426 1427 /* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock( 1428 CUdevice device) { 1429 return GetSimpleAttribute<int64>( 1430 device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK); 1431 } 1432 1433 /* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor( 1434 CUdevice device) { 1435 return GetSimpleAttribute<int64>( 1436 device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR); 1437 } 1438 1439 /* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock( 1440 CUdevice device) { 1441 return GetSimpleAttribute<int64>(device, 1442 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK); 1443 } 1444 1445 /* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock( 1446 CUdevice device) { 1447 return GetSimpleAttribute<int64>(device, 1448 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK); 1449 } 1450 1451 /* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp( 1452 CUdevice device) { 1453 return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE); 1454 } 1455 1456 /* static */ bool 
CUDADriver::GetGridLimits(int *x, int *y, int *z, 1457 CUdevice device) { 1458 int value; 1459 CUresult res = 1460 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); 1461 if (res != CUDA_SUCCESS) { 1462 LOG(ERROR) << "failed to query max grid dim x: " << ToString(res); 1463 return false; 1464 } 1465 *x = value; 1466 1467 res = 1468 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); 1469 if (res != CUDA_SUCCESS) { 1470 LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); 1471 return false; 1472 } 1473 *y = value; 1474 1475 res = 1476 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); 1477 if (res != CUDA_SUCCESS) { 1478 LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); 1479 return false; 1480 } 1481 *z = value; 1482 return true; 1483 } 1484 1485 /* static */ bool CUDADriver::GetDriverVersion(int *driver_version) { 1486 CUresult res = cuDriverGetVersion(driver_version); 1487 if (res != CUDA_SUCCESS) { 1488 LOG(ERROR) << "failed to query driver version: " << ToString(res); 1489 return false; 1490 } 1491 1492 return true; 1493 } 1494 1495 /* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties, 1496 int device_ordinal) { 1497 CUresult res = cuDeviceGetProperties(device_properties, device_ordinal); 1498 if (res != CUDA_SUCCESS) { 1499 LOG(ERROR) << "failed to query device properties: " << ToString(res); 1500 return false; 1501 } 1502 1503 return true; 1504 } 1505 1506 /* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) { 1507 int value = -1; 1508 CUresult res = 1509 cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device); 1510 if (res != CUDA_SUCCESS) { 1511 LOG(ERROR) << "failed to query ECC status: " << ToString(res); 1512 return false; 1513 } 1514 1515 *result = value; 1516 return true; 1517 } 1518 1519 /* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context, 1520 int64 *free_out, 1521 int64 
*total_out) { 1522 ScopedActivateContext activation{context}; 1523 size_t free = 0; 1524 size_t total = 0; 1525 CUresult res = cuMemGetInfo(&free, &total); 1526 if (res != CUDA_SUCCESS) { 1527 LOG(ERROR) << "failed to query device memory info: " << ToString(res); 1528 return false; 1529 } 1530 1531 *free_out = free; 1532 *total_out = total; 1533 return true; 1534 } 1535 1536 /* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device, 1537 uint64 *result) { 1538 size_t value = -1; 1539 CUresult res = cuDeviceTotalMem(&value, device); 1540 if (res != CUDA_SUCCESS) { 1541 LOG(ERROR) << "failed to query total available memory: " << ToString(res); 1542 return false; 1543 } 1544 1545 *result = value; 1546 return true; 1547 } 1548 1549 /* static */ string CUDADriver::GetPCIBusID(CUdevice device) { 1550 string pci_bus_id; 1551 static const int kBufferSize = 64; 1552 port::InlinedVector<char, 4> chars(kBufferSize); 1553 chars[kBufferSize - 1] = '\0'; 1554 CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); 1555 if (res != CUDA_SUCCESS) { 1556 LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); 1557 return pci_bus_id; 1558 } 1559 pci_bus_id = chars.begin(); 1560 return pci_bus_id; 1561 } 1562 1563 /* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from, 1564 CudaContext* to) { 1565 if (from == to) { 1566 return true; // A context can always access its own memory. 
1567 } 1568 1569 int can_access_peer = -1; 1570 auto from_device = DeviceFromContext(from); 1571 if (!from_device.ok()) { 1572 LOG(ERROR) << "failed to resolve 'from' peer access context to a device: " 1573 << from_device.status(); 1574 return false; 1575 } 1576 auto to_device = DeviceFromContext(to); 1577 if (!to_device.ok()) { 1578 LOG(ERROR) << "failed to resolve 'to' peer access context to a device: " 1579 << to_device.status(); 1580 return false; 1581 } 1582 CUresult res = cuDeviceCanAccessPeer( 1583 &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie()); 1584 if (res != CUDA_SUCCESS) { 1585 LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); 1586 return false; 1587 } 1588 1589 return can_access_peer; 1590 } 1591 1592 /* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from, 1593 CudaContext* to) { 1594 if (from == to) { 1595 return port::Status::OK(); // A context can always access its own memory. 1596 } 1597 1598 ScopedActivateContext activated{from}; 1599 CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */); 1600 if (result != CUDA_SUCCESS && 1601 result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) { 1602 return port::Status{ 1603 port::error::INTERNAL, 1604 port::Printf("failed to enable peer access from %p to %p: %s", from, to, 1605 ToString(result).c_str())}; 1606 } 1607 1608 return port::Status::OK(); 1609 } 1610 1611 /* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore( 1612 CudaContext* context, CUfunction kernel, int threads_per_block, 1613 size_t dynamic_shared_memory_bytes) { 1614 ScopedActivateContext activation{context}; 1615 1616 int max_blocks; 1617 CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor( 1618 &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes); 1619 if (result != CUDA_SUCCESS) { 1620 return port::Status{ 1621 port::error::INTERNAL, 1622 port::Printf("failed to calculate occupancy of kernel %p: %s", kernel, 1623 
ToString(result).c_str())}; 1624 } 1625 1626 return max_blocks; 1627 } 1628 1629 /* static */ CUcontext CUDADriver::CurrentContextOrDie() { 1630 CUcontext current = nullptr; 1631 CUresult result = cuCtxGetCurrent(¤t); 1632 if (result != CUDA_SUCCESS) { 1633 LOG(FATAL) << "failed to query current context: " << ToString(result); 1634 } 1635 return current; 1636 } 1637 1638 } // namespace cuda 1639 } // namespace gputools 1640 } // namespace perftools 1641