// Home | History | Annotate | Download | only in cuda
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
     17 
     18 #include <stdint.h>
     19 #include <stdlib.h>
     20 #include <map>
     21 #include <set>
     22 #include <utility>
     23 
     24 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
     25 #include "tensorflow/stream_executor/lib/casts.h"
     26 #include "tensorflow/stream_executor/lib/env.h"
     27 #include "tensorflow/stream_executor/lib/error.h"
     28 #include "tensorflow/stream_executor/lib/human_readable.h"
     29 #include "tensorflow/stream_executor/lib/notification.h"
     30 #include "tensorflow/stream_executor/lib/threadpool.h"
     31 #include "tensorflow/stream_executor/lib/stacktrace.h"
     32 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
     33 #include "tensorflow/stream_executor/lib/strcat.h"
     34 #include "tensorflow/stream_executor/lib/stringprintf.h"
     35 #include "tensorflow/stream_executor/platform/logging.h"
     36 #include "tensorflow/stream_executor/platform/mutex.h"
     37 #include "tensorflow/stream_executor/platform/port.h"
     38 #include "tensorflow/stream_executor/lib/inlined_vector.h"
     39 
#if defined(PLATFORM_WINDOWS)
// TODO: in windows ARRAYSIZE is defined in winnt.h but including it
//  here creates a conflict with cuda.h - for now define it here.
// The second divisor evaluates to 0 (forcing a compile-time divide-by-zero
// diagnostic) if `a` is not an array whose size is a multiple of its element
// size, guarding against accidentally applying ARRAYSIZE to a pointer.
#define ARRAYSIZE(a) \
  ((sizeof(a) / sizeof(*(a))) / \
  static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
#endif

// When true, InternalInit() skips cuInit and reports failure, to exercise the
// initialization error paths.
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
// When true, ScopedActivateContext construction/destruction calls
// cuCtxSynchronize so asynchronous driver errors surface near their cause.
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
// NOTE(review): not referenced in this visible chunk — presumably restricts
// execution to device ordinal 0 elsewhere in this file; confirm before use.
bool FLAGS_gpuexec_cuda_device_0_only = false;

// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyCudaContext = false;
     55 
     56 namespace perftools {
     57 namespace gputools {
     58 namespace cuda {
     59 
     60 namespace {
     61 
     62 // Manages the singleton map of contexts that we've created, mapping
     63 // from the CUcontext to the CudaContext* that we pass around internally.
     64 // This also manages assignment of unique ids to CudaContexts, to allow
     65 // for fast comparison of a context against the current context.
     66 //
     67 // CUDA-runtime-created contexts are avoided, if triple angle
     68 // brace launches are required, by using the scoped activations in
     69 // cuda_activation.h.
     70 class CreatedContexts {
     71  public:
     72   // Returns whether context is a member of the live set.
     73   static bool Has(CUcontext context) {
     74     tf_shared_lock lock{mu_};
     75     return Live()->find(context) != Live()->end();
     76   }
     77 
     78   // Adds context to the live set.
     79   static CudaContext* Add(CUcontext context) {
     80     CHECK(context != nullptr);
     81     mutex_lock lock{mu_};
     82     auto cuda_context = new CudaContext(context, next_id_++);
     83     Live()->insert(
     84         std::make_pair(context, std::unique_ptr<CudaContext>(cuda_context)));
     85     return cuda_context;
     86   }
     87 
     88   // Removes context from the live set.
     89   static void Remove(CUcontext context) {
     90     CHECK(context != nullptr);
     91     mutex_lock lock{mu_};
     92     auto it = Live()->find(context);
     93     CHECK(it != Live()->end()) << context;
     94     Live()->erase(it);
     95   }
     96 
     97  private:
     98   // Returns the live map singleton.
     99   static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
    100     static auto singleton =
    101         new std::map<CUcontext, std::unique_ptr<CudaContext>>;
    102     return singleton;
    103   }
    104 
    105   // Lock that guards access-to/mutation-of the live set.
    106   static mutex mu_;
    107   static int64 next_id_;
    108 };
    109 
    110 /* static */ mutex CreatedContexts::mu_{LINKER_INITIALIZED};
    111 /* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"
    112 
    113 // Formats CUresult to output prettified values into a log stream.
    114 // Error summaries taken from:
    115 // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
    116 //
    117 // TODO(leary) switch to cuGetErrorName when updated cuda.h is available.
    118 string ToString(CUresult result) {
    119 #define OSTREAM_CUDA_ERROR(__name) \
    120   case CUDA_ERROR_##__name:        \
    121     return "CUDA_ERROR_" #__name;
    122 
    123 ///////////////
    124 // NOTE: here we specify return code values outside of the enum explicitly
    125 // because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
    126 // libraries are deployed in the fleet these error codes are backwards
    127 // compatible, but if we see a "new" one, we want to be able to identify it in
    128 // the logs.
    129 //
    130 // Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
    131 // eliminate this function and just rely on the driver to provide us these
    132 // strings.
    133 //
    134 // NOTE: "Must reboot all context" below is shorthand for, "must
    135 // destroy/recreate the offending context and any allocation which come from
    136 // it if you are to continue using CUDA."
    137 #pragma GCC diagnostic push
    138 #pragma GCC diagnostic ignored "-Wswitch"
    139   switch (result) {
    140     OSTREAM_CUDA_ERROR(INVALID_VALUE)
    141     OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
    142     OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
    143     OSTREAM_CUDA_ERROR(DEINITIALIZED)
    144     OSTREAM_CUDA_ERROR(NO_DEVICE)
    145     OSTREAM_CUDA_ERROR(INVALID_DEVICE)
    146     OSTREAM_CUDA_ERROR(INVALID_IMAGE)
    147     OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
    148     OSTREAM_CUDA_ERROR(INVALID_HANDLE)
    149     OSTREAM_CUDA_ERROR(NOT_FOUND)
    150     OSTREAM_CUDA_ERROR(NOT_READY)
    151     OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)
    152 
    153     // Encountered an uncorrectable ECC error during execution.
    154     OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)
    155 
    156     // Load/store on an invalid address. Must reboot all context.
    157     case 700:
    158       return "CUDA_ERROR_ILLEGAL_ADDRESS";
    159     // Passed too many / wrong arguments, too many threads for register count.
    160     case 701:
    161       return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
    162     // Kernel took too long to execute.
    163     case 702:
    164       return "CUDA_ERROR_LAUNCH_TIMEOUT";
    165     // Kernel launch uses an incompatible texturing mode.
    166     case 703:
    167       return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
    168     // Trying to re-enable peer access that already has it enabled.
    169     case 704:
    170       return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
    171     // Trying to disable peer access that has not yet been enabled.
    172     case 705:
    173       return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
    174     // Primary context for the specified device has already been initialized.
    175     case 708:
    176       return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
    177     // Context current to calling thread has been destroyed or is a primary
    178     // context that has not yet been initialized.
    179     case 709:
    180       return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
    181     // Device-side assert triggered during kernel execution. Must reboot all
    182     // context.
    183     case 710:
    184       return "CUDA_ERROR_ASSERT";
    185     // Hardware resources to enable peer access have been exhausted.
    186     case 711:
    187       return "CUDA_ERROR_TOO_MANY_PEERS";
    188     // Memory range has already been registered.
    189     case 712:
    190       return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
    191     // Pointer does not correspond to any currently registered memory region.
    192     case 713:
    193       return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
    194     // Due to stack corruption or exceeding stack size limit. Must reboot all
    195     // context.
    196     case 714:
    197       return "CUDA_ERROR_HARDWARE_STACK_ERROR";
    198     case 715:
    199       return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
    200     // Load/store on an unaligned memory address. Must reboot all context.
    201     case 716:
    202       return "CUDA_ERROR_MISALIGNED_ADDRESS";
    203     // Device instruction with specific address space given address not
    204     // belonging to allowed address space. Must reboot all context.
    205     case 717:
    206       return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
    207     // Device program counter wrapped its address space. Must reboot all
    208     // context.
    209     case 718:
    210       return "CUDA_ERROR_INVALID_PC";
    211     // Exception on device while executing a kernel; e.g. deref invalid device
    212     // pointer, accessing OOB shared memory. Must reboot all context.
    213     case 719:
    214       return "CUDA_ERROR_LAUNCH_FAILED";
    215 
    216     OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
    217     OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
    218     OSTREAM_CUDA_ERROR(NOT_PERMITTED)
    219     OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
    220     OSTREAM_CUDA_ERROR(UNKNOWN)  // Unknown internal error to CUDA.
    221     default:
    222       return port::StrCat("CUresult(", static_cast<int>(result), ")");
    223   }
    224 #pragma GCC diagnostic pop
    225 }
    226 
    227 // Returns the current context and checks that it is in the set of CUDA contexts
    228 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
    229 // context behind our backs).
    230 CUcontext CurrentContext() {
    231   CUcontext current = CUDADriver::CurrentContextOrDie();
    232   if (current != nullptr && !CreatedContexts::Has(current)) {
    233     LOG(FATAL) << "current context was not created by the StreamExecutor "
    234                   "cuda_driver API: "
    235                << current
    236                << "; a CUDA runtime call "
    237                   "was likely performed without using a StreamExecutor context";
    238   }
    239   return current;
    240 }
    241 
    242 // CUDA driver routines may require a large amount of stack (particularly
    243 // cuModuleLoadDataEx, in our experience). To avoid stack overflow when using
    244 // stack-limited threads (such as those spawned by a default-argument
    245 // thread::ThreadPool on some platforms), we run certain routines in this pool
    246 // and wait for completion.
    247 static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
    248 static port::ThreadPool *InitializeDriverExecutor() {
    249   return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
    250                               "cuda_driver", 1);
    251 }
    252 
    253 port::ThreadPool *GetDriverExecutor() {
    254   mutex_lock lock(driver_executor_threadpool_mu);
    255   static port::ThreadPool *thread_pool = InitializeDriverExecutor();
    256   return thread_pool;
    257 }
    258 
    259 }  // namespace
    260 
    261 string MemorySpaceString(MemorySpace memory_space) {
    262   switch (memory_space) {
    263     case MemorySpace::kHost:
    264       return "host";
    265     case MemorySpace::kDevice:
    266       return "device";
    267     default:
    268       LOG(FATAL) << "impossible memory space";
    269   }
    270 }
    271 
    272 namespace {
    273 
    274 // Call cuCtxtSynchronize and crash if it doesn't succeed.
    275 void SynchronizeOrDie() {
    276   auto res = cuCtxSynchronize();
    277   if (res != CUDA_SUCCESS) {
    278     LOG(FATAL) << "Synchronize found "
    279                << ToString(res) << " :: " << port::CurrentStackTrace();
    280   }
    281 }
    282 
// Per-thread record of the context most recently activated via
// ScopedActivateContext, plus the activation nesting depth.
struct ThreadLocalData {
  int64 id;              // Id of the active CudaContext (0 means "no context").
  CudaContext* context;  // Only valid if id == a known good context.
  int depth;             // Number of live ScopedActivateContext activations.
};

// Thread-local instance of the above; zero-initialized POD.
SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
    290 
    291 }  // namespace
    292 
// Makes cuda_context current on the calling thread. Activations nest via the
// thread-local depth counter; only a real context switch records a context to
// restore in the destructor.
ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
  // Debugging aid: surface pending asynchronous driver errors here.
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();
  tls->depth++;
  if (tls->id == cuda_context->id()) {
    // Fast path: the requested context is already current on this thread, so
    // no driver call is needed.
    if (kVerifyCudaContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching context from " << tls->id
          << " to " << cuda_context->id();

  // At depth 1 there is no enclosing activation to return to, so the
  // destructor will leave the new context in place (to_restore_ == nullptr).
  to_restore_ = (tls->depth == 1 ? nullptr : tls->context);

  // Set the context and update thread local.
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
  tls->id = cuda_context->id();
  tls->context = cuda_context;
}
    316 
// Unwinds the activation performed by the constructor: decrements the nesting
// depth and, if an enclosing activation recorded a context to restore,
// switches back to it. The outermost activation leaves its context current.
ScopedActivateContext::~ScopedActivateContext() {
  // Debugging aid: surface pending asynchronous driver errors here.
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();

  if (kVerifyCudaContext) {
    // Note that if kVerifyCudaContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);
  if (to_restore_ == nullptr) {
    // Leave context, tls->id, and tls->context set.
    return;
  }

  // Set context and update thread local.
  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
  tls->id = to_restore_->id();
  tls->context = to_restore_;
}
    341 
    342 namespace {
    343 
    344 // Returns a stringified device number associated with pointer, primarily for
    345 // logging purposes. Returns "?" if the device could not be successfully
    346 // queried.
    347 string CUDAPointerToDeviceString(CUdeviceptr pointer) {
    348   auto value = CUDADriver::GetPointerDevice(pointer);
    349   if (value.ok()) {
    350     return port::StrCat(value.ValueOrDie());
    351   }
    352   LOG(ERROR) << "could not query device: " << value.status();
    353   return "?";
    354 }
    355 
    356 // Returns a stringified memory space associated with pointer, primarily for
    357 // logging purposes. Returns "?" if the memory space could not be successfully
    358 // queried.
    359 string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
    360   auto value = CUDADriver::GetPointerMemorySpace(pointer);
    361   if (value.ok()) {
    362     return MemorySpaceString(value.ValueOrDie());
    363   }
    364   LOG(ERROR) << "could not query device: " << value.status();
    365   return "?";
    366 }
    367 
    368 // Returns a stringified representation of whether or not peer access is
    369 // permitted between the "from" and "to" pointers' associated contexts,
    370 // primarily for logging purposes. Returns "error" if an error is encountered
    371 // in the process of querying.
    372 string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
    373   auto from_context = CUDADriver::GetPointerContext(from);
    374   if (!from_context.ok()) {
    375     LOG(ERROR) << "could not retrieve source pointer's context: "
    376                << from_context.status();
    377     return "error";
    378   }
    379   auto to_context = CUDADriver::GetPointerContext(to);
    380   if (!to_context.ok()) {
    381     LOG(ERROR) << "could not retrieve destination pointer's context: "
    382                << to_context.status();
    383     return "error";
    384   }
    385   return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
    386                                          to_context.ValueOrDie())
    387              ? "true"
    388              : "false";
    389 }
    390 
    391 
    392 // Actually performs the work of CUDA initialization. Wrapped up in one-time
    393 // execution guard.
    394 static port::Status InternalInit() {
    395   CUresult res = CUDA_ERROR_NO_DEVICE;
    396   if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
    397     LOG(ERROR) << "injecting CUDA init error; initialization will fail";
    398   } else {
    399     res = cuInit(0 /* = flags */);
    400   }
    401 
    402   if (res == CUDA_SUCCESS) {
    403     return port::Status::OK();
    404   }
    405 
    406   LOG(ERROR) << "failed call to cuInit: " << ToString(res);
    407   Diagnostician::LogDiagnosticInformation();
    408   return port::Status{port::error::ABORTED,
    409                       port::StrCat("failed call to cuInit: ", ToString(res))};
    410 }
    411 
    412 }  // namespace
    413 
    414 /* static */ port::Status CUDADriver::Init() {
    415   // Cached return value from calling InternalInit(), as cuInit need only be
    416   // called once, but CUDADriver::Init may be called many times.
    417   static port::Status init_retval;
    418   static bool set = false;
    419   static mutex *init_mu = new mutex;
    420 
    421   mutex_lock lock(*init_mu);
    422   if (!set) {
    423     init_retval = InternalInit();
    424     set = true;
    425   }
    426 
    427   return init_retval;
    428 }
    429 
    430 /* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
    431                                                 CUdevice *device) {
    432   CUresult res = cuDeviceGet(device, device_ordinal);
    433   if (res == CUDA_SUCCESS) {
    434     return port::Status::OK();
    435   }
    436 
    437   return port::Status{
    438       port::error::INTERNAL,
    439       port::StrCat("failed call to cuDeviceGet: ", ToString(res))};
    440 }
    441 
    442 /* static */ bool CUDADriver::GetDeviceName(CUdevice device,
    443                                             string *device_name) {
    444   static const size_t kCharLimit = 64;
    445   port::InlinedVector<char, 4> chars(kCharLimit);
    446   CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
    447   if (res != CUDA_SUCCESS) {
    448     LOG(ERROR) << "failed to get device name for " << device << ": "
    449                << ToString(res);
    450     return false;
    451   }
    452   chars[kCharLimit - 1] = '\0';
    453   *device_name = chars.begin();
    454   return true;
    455 }
    456 
    457 bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
    458                                  int *flags) {
    459   static_assert(DeviceOptions::kMask == 0xf,
    460                 "needs update for new device options");
    461 
    462   if (device_options.flags() & DeviceOptions::kDoNotReclaimStackAllocation) {
    463     *flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
    464   }
    465 
    466   // If no flags are set the default is CU_CTX_SCHED_AUTO, which
    467   // in Google environments is very likely to mean SPIN.
    468   if (device_options.flags() & DeviceOptions::kScheduleSpin) {
    469     *flags |= CU_CTX_SCHED_SPIN;
    470   }
    471   if (device_options.flags() & DeviceOptions::kScheduleYield) {
    472     *flags |= CU_CTX_SCHED_YIELD;
    473   }
    474   if (device_options.flags() & DeviceOptions::kScheduleBlockingSync) {
    475     *flags |= CU_CTX_SCHED_BLOCKING_SYNC;
    476   }
    477 
    478   return true;
    479 }
    480 
    481 /* static */ port::Status CUDADriver::CreateContext(
    482     CUdevice device, DeviceOptions device_options, CudaContext** context) {
    483   *context = nullptr;
    484 
    485   int flags = 0;
    486   if (!DeviceOptionsToContextFlags(device_options, &flags)) {
    487     LOG(WARNING) << "could not convert all device options into context flags";
    488   }
    489 
    490   CUresult res;
    491   CUcontext former_context;
    492   CUcontext new_context;
    493   {
    494     // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
    495     // context creation: see http://b/13248943
    496 
    497 #if CUDA_VERSION >= 7000
    498     {
    499       unsigned int former_primary_context_flags;
    500       int former_primary_context_is_active;
    501       CHECK_EQ(CUDA_SUCCESS,
    502                cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
    503                                           &former_primary_context_is_active));
    504       if (former_primary_context_flags != flags) {
    505         if (former_primary_context_is_active) {
    506           LOG(ERROR)
    507               << "The primary context is active and has a different flag set ("
    508               << former_primary_context_flags << ") than the desired flag set ("
    509               << flags << ").";
    510         } else {
    511           CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
    512         }
    513       }
    514     }
    515 
    516     former_context = CUDADriver::CurrentContextOrDie();
    517     res = cuDevicePrimaryCtxRetain(&new_context, device);
    518     if (former_context != nullptr) {
    519       CUdevice former_device;
    520       if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
    521         if (former_device == device) {
    522           if (former_context == new_context) {
    523             VLOG(2) << "The primary context " << former_context
    524                     << " for device " << device
    525                     << " exists before initializing the StreamExecutor.";
    526           } else {
    527             LOG(WARNING)
    528                 << "A non-primary context " << former_context << " for device "
    529                 << device
    530                 << " exists before initializing the StreamExecutor. The "
    531                 << "primary context is now " << new_context << ". We "
    532                 << "haven't verified StreamExecutor works with that.";
    533           }
    534         }
    535       } else {
    536         LOG(ERROR) << "Failed to get the device of the current context "
    537                    << former_context;
    538       }
    539     }
    540 #else
    541     former_context = CurrentContext();
    542     if (former_context != nullptr) {
    543       LOG(WARNING)
    544           << "creating context when one is currently active; existing: "
    545           << former_context;
    546     }
    547     res = cuCtxCreate(&new_context, flags, device);
    548 #endif
    549   }
    550   CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
    551 
    552   if (res == CUDA_SUCCESS) {
    553     *context = CreatedContexts::Add(new_context);
    554     CHECK(*context != nullptr)
    555         << "success in this call must entail non-null result";
    556     VLOG(2) << "created context " << context << " for this thread";
    557     return port::Status::OK();
    558   }
    559 
    560 #if CUDA_VERSION >= 7000
    561   string message = "failed call to cuDevicePrimaryCtxRetain: " + ToString(res);
    562 #else
    563   string message = "failed call to cuCtxCreate: " + ToString(res);
    564 #endif
    565   if (res == CUDA_ERROR_OUT_OF_MEMORY) {
    566     uint64 total_memory;
    567     if (GetDeviceTotalMemory(device, &total_memory)) {
    568       port::StrAppend(&message, "; total memory reported: ", total_memory);
    569     } else {
    570       port::StrAppend(&message, "; could not query total memory");
    571     }
    572   }
    573 
    574   return port::Status{port::error::INTERNAL, message};
    575 }
    576 
    577 /* static */ void CUDADriver::DestroyContext(CudaContext* context) {
    578   if (context == nullptr) {
    579     return;
    580   }
    581 #if CUDA_VERSION >= 7000
    582   CUcontext former_context = CurrentContext();
    583   CUresult res = cuCtxSetCurrent(context->context());
    584   CUdevice device;
    585   cuCtxGetDevice(&device);
    586   cuCtxSetCurrent(former_context);
    587 
    588   res = cuDevicePrimaryCtxRelease(device);
    589 #else
    590   CUresult res = cuCtxDestroy(context->context());
    591 #endif
    592 
    593   if (res != CUDA_SUCCESS) {
    594     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
    595   }
    596 
    597   CreatedContexts::Remove(context->context());
    598 }
    599 
    600 /* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
    601                                                CUfunction func,
    602                                                int *attribute_value) {
    603   CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
    604   if (res != CUDA_SUCCESS) {
    605     LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
    606                << ", attribute: " << attribute;
    607     return false;
    608   }
    609   return true;
    610 }
    611 
    612 /* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
    613                                                  CUfunc_cache cache_config) {
    614   CUresult res = cuFuncSetCacheConfig(function, cache_config);
    615   if (res != CUDA_SUCCESS) {
    616     LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
    617                << ", config: " << cache_config << ", result: " << ToString(res);
    618     return false;
    619   }
    620 
    621   return true;
    622 }
    623 
    624 /* static */ port::StatusOr<CUsharedconfig>
    625 CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
    626   CUsharedconfig shared_mem_config;
    627   ScopedActivateContext activation{context};
    628   CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
    629   if (result != CUDA_SUCCESS) {
    630     CUdevice device;
    631     cuCtxGetDevice(&device);
    632     LOG(ERROR) << "failed to get CUDA device shared memory config. "
    633                << "Context device ID: " << device
    634                << ", result: " << ToString(result);
    635     return port::Status{
    636         port::error::INTERNAL,
    637         port::StrCat("failed to get shared memory config: ", ToString(result))};
    638   }
    639   return shared_mem_config;
    640 }
    641 
    642 /* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
    643     CudaContext* context, CUsharedconfig shared_mem_config) {
    644   ScopedActivateContext activation{context};
    645   CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
    646   if (result != CUDA_SUCCESS) {
    647     CUdevice device;
    648     cuCtxGetDevice(&device);
    649     LOG(ERROR) << "failed to set CUDA device shared memory config. "
    650                << "Context device ID: " << device
    651                << ", config: " << shared_mem_config
    652                << ", result: " << ToString(result);
    653     return port::Status{
    654         port::error::INTERNAL,
    655         port::StrCat("failed to set shared memory config: ", ToString(result))};
    656   }
    657   return port::Status::OK();
    658 }
    659 
    660 /* static */ bool CUDADriver::LaunchKernel(
    661     CudaContext* context, CUfunction function, unsigned int grid_dim_x,
    662     unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
    663     unsigned int block_dim_y, unsigned int block_dim_z,
    664     unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
    665     void **extra) {
    666   ScopedActivateContext activation{context};
    667   VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
    668           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
    669           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
    670           << " bdz: " << block_dim_z;
    671   CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
    672                                 block_dim_x, block_dim_y, block_dim_z,
    673                                 shared_mem_bytes, stream, kernel_params, extra);
    674   if (res != CUDA_SUCCESS) {
    675     LOG(ERROR) << "failed to launch CUDA kernel: " << function
    676                << "; result: " << ToString(res);
    677     return false;
    678   }
    679   VLOG(2) << "successfully launched kernel";
    680   return true;
    681 }
    682 
    683 /* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
    684                                                 const char *cubin_bytes,
    685                                                 CUmodule *module) {
    686   ScopedActivateContext activation{context};
    687   CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
    688   if (result != CUDA_SUCCESS) {
    689     return port::Status{port::error::INTERNAL,
    690                         "failed to load in-memory CUBIN: " + ToString(result)};
    691   }
    692 
    693   return port::Status::OK();
    694 }
    695 
    696 /* static */ bool CUDADriver::LoadPtx(CudaContext* context,
    697                                       const char *ptx_contents,
    698                                       CUmodule *module) {
    699   port::Notification notification;
    700   bool ret = true;
    701   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
    702                                  &notification]() {
    703     ScopedActivateContext activation{context};
    704     void *ptx_data = const_cast<char *>(ptx_contents);
    705     static const unsigned int kLogBufferBytesLimit = 1024;
    706     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
    707     unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
    708     port::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
    709     port::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
    710     bool log_verbose = true;
    711     CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
    712                               CU_JIT_ERROR_LOG_BUFFER,
    713                               CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
    714                               CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
    715     // Note that the driver API wants the contents of this values to be stored
    716     // in an array of void*s, so we coerce them accordingly.
    717     void *option_values[] = {
    718         port::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
    719         port::bit_cast<void *>(error_log_buffer.data()),
    720         port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
    721         port::bit_cast<void *>(info_log_buffer.data()),
    722         port::bit_cast<void *>(uintptr_t(log_verbose))};
    723     CHECK(ARRAYSIZE(options) == ARRAYSIZE(option_values));
    724 
    725     CUresult res;
    726     {
    727       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
    728       // module loading: see http://b/13248943
    729 
    730       res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options,
    731                                option_values);
    732     }
    733 
    734     // The PTX JIT mutates the values in the option values array to reflect the
    735     // size of the logs it output; now that we've made the call, read the values
    736     // back out.
    737     error_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[0]);
    738     info_log_buffer_bytes = reinterpret_cast<uintptr_t>(option_values[2]);
    739     CHECK_LE(error_log_buffer_bytes, kLogBufferBytesLimit);
    740     CHECK_LE(info_log_buffer_bytes, kLogBufferBytesLimit);
    741 
    742     if (res != CUDA_SUCCESS) {
    743       LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
    744       // As a precaution for null termination of the API-provided value, ensure
    745       // that at least the last byte is null.
    746       error_log_buffer[error_log_buffer_bytes ?
    747                        error_log_buffer_bytes - 1 : 0] = '\0';
    748       LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
    749                  << " bytes): " << error_log_buffer.data();
    750       ret = false;
    751       notification.Notify();
    752     }
    753 
    754     VLOG(3) << "PTX compilation info log (" << info_log_buffer_bytes
    755             << " bytes): " << info_log_buffer.data();
    756     VLOG(3) << "PTX compilation error log (" << error_log_buffer_bytes
    757             << " bytes): " << error_log_buffer.data();
    758     CHECK(module != nullptr);
    759     notification.Notify();
    760   });
    761   notification.WaitForNotification();
    762 
    763   return ret;
    764 }
    765 
    766 /* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
    767                                                      CUdeviceptr location,
    768                                                      uint8 value, size_t size) {
    769   ScopedActivateContext activation{context};
    770   CUresult res = cuMemsetD8(location, value, size);
    771   if (res != CUDA_SUCCESS) {
    772     LOG(ERROR) << "failed to memset memory: " << ToString(res);
    773     return false;
    774   }
    775   return true;
    776 }
    777 
    778 /* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context,
    779                                                       CUdeviceptr location,
    780                                                       uint32 value,
    781                                                       size_t uint32_count) {
    782   ScopedActivateContext activation{context};
    783   CUresult res = cuMemsetD32(location, value, uint32_count);
    784   if (res != CUDA_SUCCESS) {
    785     LOG(ERROR) << "failed to memset memory: " << ToString(res);
    786     return false;
    787   }
    788   return true;
    789 }
    790 
    791 /* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context,
    792                                                       CUdeviceptr location,
    793                                                       uint8 value,
    794                                                       size_t uint32_count,
    795                                                       CUstream stream) {
    796   ScopedActivateContext activation{context};
    797   CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
    798   if (res != CUDA_SUCCESS) {
    799     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
    800     return false;
    801   }
    802   VLOG(2) << "successfully enqueued async memset operation";
    803   return true;
    804 }
    805 
    806 /* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context,
    807                                                        CUdeviceptr location,
    808                                                        uint32 value,
    809                                                        size_t uint32_count,
    810                                                        CUstream stream) {
    811   ScopedActivateContext activation{context};
    812   CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
    813   if (res != CUDA_SUCCESS) {
    814     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
    815     return false;
    816   }
    817   VLOG(2) << "successfully enqueued async memset operation";
    818   return true;
    819 }
    820 
    821 /* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
    822                                                 CUstream stream,
    823                                                 StreamCallback callback,
    824                                                 void *data) {
    825   // Note: flags param is required to be zero according to CUDA 6.0.
    826   CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
    827   if (res != CUDA_SUCCESS) {
    828     LOG(ERROR) << "unable to add host callback: " << ToString(res);
    829     return false;
    830   }
    831   return true;
    832 }
    833 
    834 /* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
    835                                                 CUmodule module,
    836                                                 const char *kernel_name,
    837                                                 CUfunction *function) {
    838   ScopedActivateContext activated{context};
    839   CHECK(module != nullptr && kernel_name != nullptr);
    840   CUresult res = cuModuleGetFunction(function, module, kernel_name);
    841   if (res != CUDA_SUCCESS) {
    842     LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
    843                << "\" from module: " << ToString(res);
    844     return false;
    845   }
    846 
    847   return true;
    848 }
    849 
    850 /* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
    851                                               CUmodule module,
    852                                               const char *symbol_name,
    853                                               CUdeviceptr *dptr,
    854                                               size_t *bytes) {
    855   ScopedActivateContext activated{context};
    856   CHECK(module != nullptr && symbol_name != nullptr &&
    857         (dptr != nullptr || bytes != nullptr));
    858   CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
    859   if (res != CUDA_SUCCESS) {
    860     // symbol may not be found in the current module, but it may reside in
    861     // another module.
    862     VLOG(2) << "failed to get symbol \"" << symbol_name
    863             << "\" from module: " << ToString(res);
    864     return false;
    865   }
    866 
    867   return true;
    868 }
    869 
    870 /* static */ void CUDADriver::UnloadModule(CudaContext *context,
    871                                            CUmodule module) {
    872   ScopedActivateContext activated{context};
    873   CUresult res = cuModuleUnload(module);
    874   if (res != CUDA_SUCCESS) {
    875     LOG(ERROR) << "failed to unload module " << module
    876                << "; leaking: " << ToString(res);
    877   }
    878 }
    879 
    880 /* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
    881     CudaContext* context) {
    882   ScopedActivateContext activated{context};
    883   CUdevice device = -1;
    884   CUresult result = cuCtxGetDevice(&device);
    885   if (result == CUDA_SUCCESS) {
    886     return device;
    887   }
    888 
    889   return port::Status{
    890       port::error::INTERNAL,
    891       port::StrCat("failed to get device for context: ", ToString(result))};
    892 }
    893 
    894 /* static */ bool CUDADriver::CreateStream(CudaContext *context,
    895                                            CUstream *out) {
    896   // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
    897   // up synchronization with respect to memsets and any other things that have
    898   // to occur on the default stream?
    899   ScopedActivateContext activated{context};
    900   CUresult res = cuStreamCreate(out, 0);
    901   if (res != CUDA_SUCCESS) {
    902     LOG(ERROR) << "could not allocate CUDA stream for context " << context
    903                << ": " << ToString(res);
    904     return false;
    905   }
    906 
    907   VLOG(2) << "successfully created stream " << *out << " for context "
    908           << context << " on thread";
    909   return true;
    910 }
    911 
    912 /* static */ void CUDADriver::DestroyStream(CudaContext* context,
    913                                             CUstream *stream) {
    914   if (*stream == nullptr) {
    915     return;
    916   }
    917 
    918   ScopedActivateContext activated{context};
    919   CUresult res = cuStreamDestroy(*stream);
    920   if (res != CUDA_SUCCESS) {
    921     LOG(ERROR) << "failed to destroy CUDA stream for context " << context
    922                << ": " << ToString(res);
    923   } else {
    924     VLOG(2) << "successfully destroyed stream " << *stream << " for context "
    925             << context;
    926     *stream = nullptr;
    927   }
    928 }
    929 
    930 /* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
    931                                               uint64 bytes) {
    932   ScopedActivateContext activated{context};
    933   CUdeviceptr result = 0;
    934   CUresult res = cuMemAlloc(&result, bytes);
    935   if (res != CUDA_SUCCESS) {
    936     LOG(ERROR) << "failed to allocate "
    937                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
    938                << " bytes) from device: " << ToString(res);
    939     return nullptr;
    940   }
    941   void *ptr = reinterpret_cast<void *>(result);
    942   VLOG(2) << "allocated " << ptr << " for context " << context << " of "
    943           << bytes << " bytes";
    944   return ptr;
    945 }
    946 
    947 /* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
    948                                                void *location) {
    949   ScopedActivateContext activation{context};
    950   CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
    951   CUresult res = cuMemFree(pointer);
    952   if (res != CUDA_SUCCESS) {
    953     LOG(ERROR) << "failed to free device memory at " << location
    954                << "; result: " << ToString(res);
    955   } else {
    956     VLOG(2) << "deallocated " << location << " for context " << context;
    957   }
    958 }
    959 
    960 /* static */ void *CUDADriver::HostAllocate(CudaContext *context,
    961                                             uint64 bytes) {
    962   ScopedActivateContext activation{context};
    963   void *host_mem = nullptr;
    964   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
    965   CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
    966   if (res != CUDA_SUCCESS) {
    967     LOG(ERROR) << "failed to alloc " << bytes
    968                << " bytes on host: " << ToString(res);
    969   }
    970   return host_mem;
    971 }
    972 
    973 /* static */ void CUDADriver::HostDeallocate(CudaContext* context,
    974                                              void *location) {
    975   ScopedActivateContext activation{context};
    976   CUresult res = cuMemFreeHost(location);
    977   if (res != CUDA_SUCCESS) {
    978     LOG(ERROR) << "error deallocating host memory at " << location << ": "
    979                << ToString(res);
    980   }
    981 }
    982 
    983 /* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
    984                                            uint64 bytes) {
    985   ScopedActivateContext activation{context};
    986   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
    987   CUresult res =
    988       cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
    989   if (res != CUDA_SUCCESS) {
    990     LOG(ERROR) << "error registering host memory at " << location << ": "
    991                << ToString(res);
    992     return false;
    993   }
    994   return true;
    995 }
    996 
    997 /* static */ bool CUDADriver::HostUnregister(CudaContext* context,
    998                                              void *location) {
    999   ScopedActivateContext activation{context};
   1000   CUresult res = cuMemHostUnregister(location);
   1001   if (res != CUDA_SUCCESS) {
   1002     LOG(ERROR) << "error unregistering host memory at " << location << ": "
   1003                << ToString(res);
   1004     return false;
   1005   }
   1006   return true;
   1007 }
   1008 
   1009 /* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
   1010                                                    CUevent *event) {
   1011   if (*event == nullptr) {
   1012     return port::Status{port::error::INVALID_ARGUMENT,
   1013                         "input event cannot be null"};
   1014   }
   1015 
   1016   ScopedActivateContext activated{context};
   1017   CUresult res = cuEventDestroy(*event);
   1018   *event = nullptr;
   1019 
   1020   switch (res) {
   1021     case CUDA_SUCCESS:
   1022       return port::Status::OK();
   1023     case CUDA_ERROR_DEINITIALIZED:
   1024     case CUDA_ERROR_NOT_INITIALIZED:
   1025       return port::Status{
   1026           port::error::FAILED_PRECONDITION,
   1027           port::Printf("error destroying CUDA event in context %p: %s", context,
   1028                        ToString(res).c_str())};
   1029     default:
   1030       return port::Status{
   1031           port::error::INTERNAL,
   1032           port::Printf("error destroying CUDA event in context %p: %s", context,
   1033                        ToString(res).c_str())};
   1034   }
   1035 }
   1036 
   1037 /* static */ port::Status CUDADriver::RecordEvent(CudaContext* context,
   1038                                                   CUevent event,
   1039                                                   CUstream stream) {
   1040   ScopedActivateContext activated{context};
   1041   CUresult res = cuEventRecord(event, stream);
   1042   switch (res) {
   1043     case CUDA_SUCCESS:
   1044       return port::Status::OK();
   1045     case CUDA_ERROR_DEINITIALIZED:
   1046     case CUDA_ERROR_NOT_INITIALIZED:
   1047       return port::Status{
   1048           port::error::FAILED_PRECONDITION,
   1049           port::Printf("error recording CUDA event on stream %p: %s", stream,
   1050                        ToString(res).c_str())};
   1051     default:
   1052       return port::Status{
   1053           port::error::INVALID_ARGUMENT,
   1054           port::Printf("error recording CUDA event on stream %p: %s", stream,
   1055                        ToString(res).c_str())};
   1056   }
   1057 }
   1058 
   1059 /* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
   1060     CudaContext *context, CUevent event) {
   1061   ScopedActivateContext activated{context};
   1062   CUresult res = cuEventQuery(event);
   1063   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
   1064     return port::Status{
   1065         port::error::INTERNAL,
   1066         port::Printf("failed to query event: %s", ToString(res).c_str())};
   1067   }
   1068 
   1069   return res;
   1070 }
   1071 
   1072 /* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context,
   1073                                                   float *elapsed_milliseconds,
   1074                                                   CUevent start, CUevent stop) {
   1075   ScopedActivateContext activated{context};
   1076   // The stop event must have completed in order for cuEventElapsedTime to
   1077   // work.
   1078   CUresult res = cuEventSynchronize(stop);
   1079   if (res != CUDA_SUCCESS) {
   1080     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
   1081     return false;
   1082   }
   1083   res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
   1084   if (res != CUDA_SUCCESS) {
   1085     LOG(ERROR) << "failed to get elapsed time between events: "
   1086                << ToString(res);
   1087     return false;
   1088   }
   1089 
   1090   return true;
   1091 }
   1092 
   1093 /* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
   1094                                                 CUstream stream,
   1095                                                 CUevent event) {
   1096   ScopedActivateContext activation{context};
   1097   CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
   1098   if (res != CUDA_SUCCESS) {
   1099     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
   1100     return false;
   1101   }
   1102 
   1103   return true;
   1104 }
   1105 
   1106 /* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
   1107   ScopedActivateContext activation{context};
   1108   CUresult res = cuCtxSynchronize();
   1109   if (res != CUDA_SUCCESS) {
   1110     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
   1111                << " :: " << port::CurrentStackTrace();
   1112     return false;
   1113   }
   1114 
   1115   return true;
   1116 }
   1117 
   1118 /* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
   1119                                                         CUstream stream) {
   1120   ScopedActivateContext activated{context};
   1121   CHECK(stream != nullptr);
   1122   CUresult res = cuStreamSynchronize(stream);
   1123   if (res != CUDA_SUCCESS) {
   1124     port::Status status = port::InternalError(
   1125         port::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
   1126     LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
   1127     return status;
   1128   }
   1129   VLOG(2) << "successfully synchronized stream " << stream << " on context "
   1130           << context;
   1131   return port::Status::OK();
   1132 }
   1133 
   1134 /* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
   1135                                            CUstream stream) {
   1136   ScopedActivateContext activated{context};
   1137   CHECK(stream != nullptr);
   1138   CUresult res = cuStreamQuery(stream);
   1139   if (res == CUDA_SUCCESS) {
   1140     return true;
   1141   }
   1142 
   1143   if (res != CUDA_ERROR_NOT_READY) {
   1144     LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
   1145   }
   1146   return false;
   1147 }
   1148 
   1149 /* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
   1150                                                            void *host_dst,
   1151                                                            CUdeviceptr gpu_src,
   1152                                                            uint64 size) {
   1153   ScopedActivateContext activation{context};
   1154   CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
   1155   if (res != CUDA_SUCCESS) {
   1156     return port::InternalError(
   1157         port::Printf("failed to synchronous memcpy from device to host: %s; "
   1158                      "host dst: %p; GPU src: %p; size: %llu=0x%llx",
   1159                      ToString(res).c_str(), host_dst,
   1160                      port::bit_cast<void *>(gpu_src), size, size));
   1161   }
   1162   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
   1163           << host_dst;
   1164   return port::Status::OK();
   1165 }
   1166 
   1167 /* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
   1168                                                            CUdeviceptr gpu_dst,
   1169                                                            const void *host_src,
   1170                                                            uint64 size) {
   1171   ScopedActivateContext activation{context};
   1172   CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
   1173   if (res != CUDA_SUCCESS) {
   1174     return port::InternalError(port::Printf(
   1175         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
   1176         " host src: %p; size: %llu=0x%llx",
   1177         ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size,
   1178         size));
   1179   }
   1180   VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
   1181   return port::Status::OK();
   1182 }
   1183 
   1184 /* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
   1185                                                            CUdeviceptr gpu_dst,
   1186                                                            CUdeviceptr gpu_src,
   1187                                                            uint64 size) {
   1188   ScopedActivateContext activation{context};
   1189   CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
   1190   if (res != CUDA_SUCCESS) {
   1191     return port::InternalError(port::Printf(
   1192         "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
   1193         "GPU src: %p; size: %llu=0x%llx",
   1194         ToString(res).c_str(), port::bit_cast<void *>(gpu_dst),
   1195         port::bit_cast<void *>(gpu_src), size, size));
   1196   }
   1197   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
   1198   return port::Status::OK();
   1199 }
   1200 
   1201 /* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
   1202                                                     void *host_dst,
   1203                                                     CUdeviceptr gpu_src,
   1204                                                     uint64 size,
   1205                                                     CUstream stream) {
   1206   ScopedActivateContext activation{context};
   1207   CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   1208   if (res != CUDA_SUCCESS) {
   1209     LOG(ERROR) << port::Printf(
   1210         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
   1211         "GPU src: %p; size: %llu=0x%llx",
   1212         ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size);
   1213     return false;
   1214   }
   1215   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
   1216           << " bytes from " << port::bit_cast<void *>(gpu_src) << " to " << host_dst
   1217           << " on stream " << stream;
   1218   return true;
   1219 }
   1220 
   1221 /* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context,
   1222                                                     CUdeviceptr gpu_dst,
   1223                                                     const void *host_src,
   1224                                                     uint64 size,
   1225                                                     CUstream stream) {
   1226   ScopedActivateContext activation{context};
   1227   CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
   1228   if (res != CUDA_SUCCESS) {
   1229     LOG(ERROR) << port::Printf(
   1230         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
   1231         "host src: %p; size: %llu=0x%llx",
   1232         ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size);
   1233     return false;
   1234   }
   1235   VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
   1236           << " on stream " << stream;
   1237   return true;
   1238 }
   1239 
   1240 /* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context,
   1241                                                     CUdeviceptr gpu_dst,
   1242                                                     CUdeviceptr gpu_src,
   1243                                                     uint64 size,
   1244                                                     CUstream stream) {
   1245   ScopedActivateContext activation{context};
   1246   CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   1247   if (result != CUDA_SUCCESS) {
   1248     LOG(ERROR) << port::Printf(
   1249         "failed to enqueue async memcpy from device to device: %s"
   1250         "; GPU dst: %p on %s %s"
   1251         "; GPU src: %p on %s %s"
   1252         "; can access? %s; size: %llu=0x%llx",
   1253         ToString(result).c_str(), port::bit_cast<void *>(gpu_dst),
   1254         CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
   1255         CUDAPointerToDeviceString(gpu_dst).c_str(), port::bit_cast<void *>(gpu_src),
   1256         CUDAPointerToMemorySpaceString(gpu_src).c_str(),
   1257         CUDAPointerToDeviceString(gpu_src).c_str(),
   1258         CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
   1259 
   1260     return false;
   1261   }
   1262   VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
   1263   return true;
   1264 }
   1265 
   1266 /* static */ port::Status CUDADriver::CreateEvent(CudaContext* context,
   1267                                                   CUevent *result,
   1268                                                   EventFlags flags) {
   1269   int cuflags;
   1270   switch (flags) {
   1271     case EventFlags::kDefault:
   1272       cuflags = CU_EVENT_DEFAULT;
   1273       break;
   1274     case EventFlags::kDisableTiming:
   1275       cuflags = CU_EVENT_DISABLE_TIMING;
   1276       break;
   1277     default:
   1278       LOG(FATAL) << "impossible event flags: " << int(flags);
   1279   }
   1280 
   1281   ScopedActivateContext activated{context};
   1282   CUresult res = cuEventCreate(result, cuflags);
   1283 
   1284   if (res == CUDA_SUCCESS) {
   1285     return port::Status::OK();
   1286   } else if (res == CUDA_ERROR_OUT_OF_MEMORY) {
   1287     return port::Status{port::error::RESOURCE_EXHAUSTED,
   1288                         "could not create CUDA event: out of device memory"};
   1289   } else {
   1290     return port::Status{
   1291         port::error::FAILED_PRECONDITION,
   1292         port::StrCat("could not create CUDA event: ", ToString(res))};
   1293   }
   1294 }
   1295 
   1296 /* static */ int CUDADriver::GetDeviceCount() {
   1297   int device_count = 0;
   1298   CUresult res = cuDeviceGetCount(&device_count);
   1299   if (res != CUDA_SUCCESS) {
   1300     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
   1301     return 0;
   1302   }
   1303 
   1304   if (FLAGS_gpuexec_cuda_device_0_only && device_count > 1) {
   1305     device_count = 1;
   1306   }
   1307   return device_count;
   1308 }
   1309 
   1310 /* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
   1311     CUdeviceptr pointer) {
   1312   CudaContext* context = nullptr;
   1313   CUresult result =
   1314       cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
   1315   if (result == CUDA_SUCCESS) {
   1316     CHECK(context != nullptr) << "success should entail non-null context";
   1317     return context;
   1318   }
   1319 
   1320   return port::Status{
   1321       port::error::INTERNAL,
   1322       port::StrCat("failed to query device pointer for context: ",
   1323                    ToString(result))};
   1324 }
   1325 
   1326 /* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
   1327     CUdeviceptr pointer) {
   1328   unsigned int value;
   1329   CUresult result =
   1330       cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
   1331   if (result == CUDA_SUCCESS) {
   1332     switch (value) {
   1333       case CU_MEMORYTYPE_DEVICE:
   1334         return MemorySpace::kDevice;
   1335       case CU_MEMORYTYPE_HOST:
   1336         return MemorySpace::kHost;
   1337       default:
   1338         return port::Status{
   1339             port::error::INTERNAL,
   1340             port::StrCat("unknown memory space provided by CUDA API: ", value)};
   1341     }
   1342   }
   1343 
   1344   return port::Status{
   1345       port::error::INTERNAL,
   1346       port::StrCat("failed to query device pointer for memory space: ",
   1347                    ToString(result))};
   1348 }
   1349 
   1350 /* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
   1351                                                              CUdeviceptr *base,
   1352                                                              size_t *size) {
   1353   CUresult result = cuMemGetAddressRange(base, size, dptr);
   1354   if (result == CUDA_SUCCESS) {
   1355     return port::Status::OK();
   1356   } else if (result == CUDA_ERROR_NOT_FOUND) {
   1357     // We differentiate between "this pointer is unknown" (return here) and
   1358     // "there was an internal error while performing this operation" (return
   1359     // below).
   1360     return port::Status{
   1361         port::error::NOT_FOUND,
   1362         port::Printf("not a device pointer %p; %s",
   1363                      reinterpret_cast<void *>(dptr), ToString(result).c_str())};
   1364   }
   1365 
   1366   return port::Status{
   1367       port::error::INTERNAL,
   1368       port::Printf("failed to get pointer into for device pointer %p; %s",
   1369                    reinterpret_cast<void *>(dptr), ToString(result).c_str())};
   1370 }
   1371 
   1372 /* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
   1373     CUdeviceptr pointer) {
   1374   auto result = GetPointerContext(pointer);
   1375   if (!result.ok()) {
   1376     return result.status();
   1377   }
   1378 
   1379   return DeviceFromContext(result.ValueOrDie());
   1380 }
   1381 
   1382 /* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
   1383                                                            int *cc_minor,
   1384                                                            CUdevice device) {
   1385   *cc_major = 0;
   1386   *cc_minor = 0;
   1387   CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device);
   1388   if (result == CUDA_SUCCESS) {
   1389     return port::Status::OK();
   1390   }
   1391 
   1392   return port::Status{
   1393       port::error::INTERNAL,
   1394       port::Printf("failed to get compute capability for device: %s; %d",
   1395                    ToString(result).c_str(), device)};
   1396 }
   1397 
   1398 // Helper function that turns the integer output of cuDeviceGetAttribute to type
   1399 // T and wraps it in a StatusOr.
   1400 template <typename T>
   1401 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   1402                                             CUdevice_attribute attribute) {
   1403   int value = -1;
   1404   CUresult result = cuDeviceGetAttribute(&value, attribute, device);
   1405   if (result != CUDA_SUCCESS) {
   1406     return port::Status{
   1407         port::error::NOT_FOUND,
   1408         port::StrCat("could not retrieve CUDA device attribute (", attribute,
   1409                      "): ", ToString(result))};
   1410   }
   1411   T converted = value;
   1412   return converted;
   1413 }
   1414 
   1415 /* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
   1416     CUdevice device) {
   1417   return GetSimpleAttribute<int>(device,
   1418                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
   1419 }
   1420 
   1421 /* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
   1422     CUdevice device) {
   1423   return GetSimpleAttribute<int64>(
   1424       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
   1425 }
   1426 
   1427 /* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
   1428     CUdevice device) {
   1429   return GetSimpleAttribute<int64>(
   1430       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
   1431 }
   1432 
   1433 /* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
   1434     CUdevice device) {
   1435   return GetSimpleAttribute<int64>(
   1436       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
   1437 }
   1438 
   1439 /* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
   1440     CUdevice device) {
   1441   return GetSimpleAttribute<int64>(device,
   1442                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
   1443 }
   1444 
   1445 /* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
   1446     CUdevice device) {
   1447   return GetSimpleAttribute<int64>(device,
   1448                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
   1449 }
   1450 
   1451 /* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
   1452     CUdevice device) {
   1453   return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
   1454 }
   1455 
   1456 /* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
   1457                                             CUdevice device) {
   1458   int value;
   1459   CUresult res =
   1460       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
   1461   if (res != CUDA_SUCCESS) {
   1462     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
   1463     return false;
   1464   }
   1465   *x = value;
   1466 
   1467   res =
   1468       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
   1469   if (res != CUDA_SUCCESS) {
   1470     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
   1471     return false;
   1472   }
   1473   *y = value;
   1474 
   1475   res =
   1476       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
   1477   if (res != CUDA_SUCCESS) {
   1478     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
   1479     return false;
   1480   }
   1481   *z = value;
   1482   return true;
   1483 }
   1484 
   1485 /* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
   1486   CUresult res = cuDriverGetVersion(driver_version);
   1487   if (res != CUDA_SUCCESS) {
   1488     LOG(ERROR) << "failed to query driver version: " << ToString(res);
   1489     return false;
   1490   }
   1491 
   1492   return true;
   1493 }
   1494 
   1495 /* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties,
   1496                                                   int device_ordinal) {
   1497   CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
   1498   if (res != CUDA_SUCCESS) {
   1499     LOG(ERROR) << "failed to query device properties: " << ToString(res);
   1500     return false;
   1501   }
   1502 
   1503   return true;
   1504 }
   1505 
   1506 /* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
   1507   int value = -1;
   1508   CUresult res =
   1509       cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
   1510   if (res != CUDA_SUCCESS) {
   1511     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
   1512     return false;
   1513   }
   1514 
   1515   *result = value;
   1516   return true;
   1517 }
   1518 
   1519 /* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
   1520                                                   int64 *free_out,
   1521                                                   int64 *total_out) {
   1522   ScopedActivateContext activation{context};
   1523   size_t free = 0;
   1524   size_t total = 0;
   1525   CUresult res = cuMemGetInfo(&free, &total);
   1526   if (res != CUDA_SUCCESS) {
   1527     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
   1528     return false;
   1529   }
   1530 
   1531   *free_out = free;
   1532   *total_out = total;
   1533   return true;
   1534 }
   1535 
   1536 /* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
   1537                                                    uint64 *result) {
   1538   size_t value = -1;
   1539   CUresult res = cuDeviceTotalMem(&value, device);
   1540   if (res != CUDA_SUCCESS) {
   1541     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
   1542     return false;
   1543   }
   1544 
   1545   *result = value;
   1546   return true;
   1547 }
   1548 
   1549 /* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
   1550   string pci_bus_id;
   1551   static const int kBufferSize = 64;
   1552   port::InlinedVector<char, 4> chars(kBufferSize);
   1553   chars[kBufferSize - 1] = '\0';
   1554   CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
   1555   if (res != CUDA_SUCCESS) {
   1556     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
   1557     return pci_bus_id;
   1558   }
   1559   pci_bus_id = chars.begin();
   1560   return pci_bus_id;
   1561 }
   1562 
   1563 /* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from,
   1564                                                   CudaContext* to) {
   1565   if (from == to) {
   1566     return true;  // A context can always access its own memory.
   1567   }
   1568 
   1569   int can_access_peer = -1;
   1570   auto from_device = DeviceFromContext(from);
   1571   if (!from_device.ok()) {
   1572     LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
   1573                << from_device.status();
   1574     return false;
   1575   }
   1576   auto to_device = DeviceFromContext(to);
   1577   if (!to_device.ok()) {
   1578     LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
   1579                << to_device.status();
   1580     return false;
   1581   }
   1582   CUresult res = cuDeviceCanAccessPeer(
   1583       &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
   1584   if (res != CUDA_SUCCESS) {
   1585     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
   1586     return false;
   1587   }
   1588 
   1589   return can_access_peer;
   1590 }
   1591 
   1592 /* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from,
   1593                                                        CudaContext* to) {
   1594   if (from == to) {
   1595     return port::Status::OK();  // A context can always access its own memory.
   1596   }
   1597 
   1598   ScopedActivateContext activated{from};
   1599   CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
   1600   if (result != CUDA_SUCCESS &&
   1601       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
   1602     return port::Status{
   1603         port::error::INTERNAL,
   1604         port::Printf("failed to enable peer access from %p to %p: %s", from, to,
   1605                      ToString(result).c_str())};
   1606   }
   1607 
   1608   return port::Status::OK();
   1609 }
   1610 
   1611 /* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
   1612     CudaContext* context, CUfunction kernel, int threads_per_block,
   1613     size_t dynamic_shared_memory_bytes) {
   1614   ScopedActivateContext activation{context};
   1615 
   1616   int max_blocks;
   1617   CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
   1618       &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
   1619   if (result != CUDA_SUCCESS) {
   1620     return port::Status{
   1621         port::error::INTERNAL,
   1622         port::Printf("failed to calculate occupancy of kernel %p: %s", kernel,
   1623                      ToString(result).c_str())};
   1624   }
   1625 
   1626   return max_blocks;
   1627 }
   1628 
   1629 /* static */ CUcontext CUDADriver::CurrentContextOrDie() {
   1630   CUcontext current = nullptr;
   1631   CUresult result = cuCtxGetCurrent(&current);
   1632   if (result != CUDA_SUCCESS) {
   1633     LOG(FATAL) << "failed to query current context: " << ToString(result);
   1634   }
   1635   return current;
   1636 }
   1637 
   1638 }  // namespace cuda
   1639 }  // namespace gputools
   1640 }  // namespace perftools
   1641