// Home | History | Annotate | Download | only in cuda  (code-browser navigation residue)
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // CUDA userspace driver library wrapper functionality.
     17 
     18 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
     19 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
     20 
     21 #include <stddef.h>
     22 #include "tensorflow/stream_executor/platform/port.h"
     23 
     24 #include "tensorflow/stream_executor/device_options.h"
     25 #include "tensorflow/stream_executor/lib/status.h"
     26 #include "tensorflow/stream_executor/lib/statusor.h"
     27 #include "tensorflow/stream_executor/platform/port.h"
     28 #include "cuda/include/cuda.h"
     29 
     30 namespace perftools {
     31 namespace gputools {
     32 namespace cuda {
     33 
// Identifies the memory space where an allocation resides. See
// CUDADriver::GetPointerMemorySpace().
enum class MemorySpace { kHost, kDevice };

// Returns a human-readable string, such as "host", for the provided memory
// space; used in log and error messages.
string MemorySpaceString(MemorySpace memory_space);
     40 
     41 class CudaContext;
     42 
     43 // CUDADriver contains wrappers for calls to the userspace library driver. It's
     44 // useful to isolate these calls and put basic wrappers around them to separate
     45 // userspace library driver behaviors from the rest of the program.
     46 //
     47 // At the moment it's simply used as a namespace.
     48 //
     49 // The calls log any specific errors internally and return whether the operation
     50 // was successful to the caller.
     51 //
     52 // The order of parameters is generally kept symmetric with the underlying CUDA
     53 // driver API.
     54 //
     55 // Links on functions are to specific documentation under
     56 // http://docs.nvidia.com/cuda/cuda-driver-api/
     57 //
     58 // Thread safety: these functions should not be used from signal handlers.
     59 class CUDADriver {
     60  public:
     61   // Wraps a call to cuInit with logging to help indicate what has gone wrong in
     62   // the case of failure. Safe to call multiple times; will be fast on all calls
     63   // after the first.
     64   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
     65   static port::Status Init();
     66 
     67   // Returns the device associated with the given context.
     68   // device is an outparam owned by the caller, must not be null.
     69   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
     70   static port::StatusOr<CUdevice> DeviceFromContext(CudaContext* context);
     71 
     72   // Creates a new CUDA stream associated with the given context via
     73   // cuStreamCreate.
     74   // stream is an outparam owned by the caller, must not be null.
     75   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
     76   static bool CreateStream(CudaContext* context, CUstream *stream);
     77 
     78   // Destroys a CUDA stream associated with the given context.
     79   // stream is owned by the caller, must not be null, and *stream is set to null
     80   // if the stream is successfully destroyed.
     81   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
     82   static void DestroyStream(CudaContext* context, CUstream *stream);
     83 
     84   // CUDA events can explicitly disable event TSC retrieval for some presumed
     85   // performance improvement if timing is unnecessary.
     86   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
     87   enum class EventFlags { kDefault, kDisableTiming };
     88 
     89   // Creates a new event associated with the given context.
     90   // result is an outparam owned by the caller and must not be null.
     91   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
     92   static port::Status CreateEvent(CudaContext* context, CUevent *result,
     93                                   EventFlags flags);
     94 
     95   // Destroys *event and turns it into a nullptr. event may not be null, but
     96   // *event may be, via cuEventDestroy
     97   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
     98   static port::Status DestroyEvent(CudaContext* context, CUevent *event);
     99 
    100   // Allocates a GPU memory space of size bytes associated with the given
    101   // context via cuMemAlloc.
    102   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
    103   static void *DeviceAllocate(CudaContext* context, uint64 bytes);
    104 
    105   // Deallocates a GPU memory space of size bytes associated with the given
    106   // context via cuMemFree.
    107   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
    108   static void DeviceDeallocate(CudaContext* context, void *location);
    109 
    110   // Allocates page-locked and CUDA-registered memory on the host via
    111   // cuMemAllocHost.
    112   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
    113   static void *HostAllocate(CudaContext* context, uint64 bytes);
    114 
    115   // Deallocates a location created by HostAllocate, via cuMemFreeHost.
    116   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
    117   static void HostDeallocate(CudaContext* context, void *location);
    118 
    119   // Registers a memory region at location of size bytes via cuMemHostRegister.
    120   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
    121   static bool HostRegister(CudaContext* context, void *location, uint64 bytes);
    122 
    123   // Unregisters a memory region that was previously registered at location via
    124   // cuMemHostUnregister.
    125   //
    126   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
    127   //
    128   // TODO(leary) verify an error will be returned if the location wasn't
    129   // previously registered.
    130   static bool HostUnregister(CudaContext* context, void *location);
    131 
    132   // Given a device ordinal, returns a device handle into the device outparam,
    133   // which must not be null.
    134   //
    135   // N.B. these device handles do not have a corresponding destroy function in
    136   // the CUDA driver API.
    137   static port::Status GetDevice(int device_ordinal, CUdevice *device);
    138 
    139   // Given a device handle, returns the name reported by the driver for the
    140   // device.
    141   static bool GetDeviceName(CUdevice device, string *name_out);
    142 
    143   // Given a device to create a context for, returns a context handle into the
    144   // context outparam, which must not be null.
    145   //
    146   // N.B. CUDA contexts are weird. They are implicitly associated with the
    147   // calling thread. Current documentation on contexts and their influence on
    148   // userspace processes is given here:
    149   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
    150   static port::Status CreateContext(CUdevice device,
    151                                     DeviceOptions device_options,
    152                                     CudaContext** context);
    153 
    154   // Destroys the provided context via cuCtxDestroy.
    155   // Don't do this while clients could still be using the context, per the docs
    156   // bad things will happen.
    157   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
    158   static void DestroyContext(CudaContext* context);
    159 
    160   // Queries the runtime for the specified attribute of the specified function.
    161   // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
    162   // in terms of integer-sized values, so there's no potential for overrun (as
    163   // of CUDA 5.5).
    164   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
    165   static bool FuncGetAttribute(CUfunction_attribute attribute,
    166                                CUfunction function, int *attribute_value);
    167 
    168   // Sets the preferred cache configuration for the specified function.
    169   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
    170   static bool FuncSetCacheConfig(CUfunction function,
    171                                  CUfunc_cache cache_config);
    172 
    173   // Gets the preferred shared memory bank configuration for the specified
    174   // CONTEXT (not function!), either default or four- or eight-byte bank size.
    175   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
    176   static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig(
    177       CudaContext* context);
    178 
    179   // Sets the preferred shared memory bank configuration for the specified
    180   // CONTEXT (not function!), either default or four- or eight-byte bank size.
    181   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
    182   static port::Status ContextSetSharedMemConfig(
    183       CudaContext* context, CUsharedconfig shared_mem_config);
    184 
    185   // Launches a CUDA kernel via cuLaunchKernel.
    186   // TODO(leary) describe the structure of kernel_params and extra in a readable
    187   // way.
    188   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
    189   static bool LaunchKernel(CudaContext* context, CUfunction function,
    190                            unsigned int grid_dim_x, unsigned int grid_dim_y,
    191                            unsigned int grid_dim_z, unsigned int block_dim_x,
    192                            unsigned int block_dim_y, unsigned int block_dim_z,
    193                            unsigned int shared_mem_bytes, CUstream stream,
    194                            void **kernel_params, void **extra);
    195 
    196   // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
    197   // handle in "module". Any error logs that are produced are logged internally.
    198   static bool LoadPtx(CudaContext* context, const char *ptx_contents,
    199                       CUmodule *module);
    200 
    201   // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
    202   // the resulting handle in "module".
    203   static port::Status LoadCubin(CudaContext* context, const char *cubin_bytes,
    204                                 CUmodule *module);
    205 
    206   // Retrieves a named kernel from a loaded module, and places the resulting
    207   // handle into function (outparam) on success. Neither kernel_name nor
    208   // function may be null. No ownership is taken of kernel_name.
    209   static bool GetModuleFunction(CudaContext* context, CUmodule module,
    210                                 const char *kernel_name, CUfunction *function);
    211 
    212   // Retrieves a named global/constant symbol from a loaded module, and returns
    213   // a device pointer and size of the symbol on success. symbol_name may not be
    214   // null. At least one of dptr or bytes should not be null. No ownership is
    215   // taken of symbol_name.
    216   static bool GetModuleSymbol(CudaContext* context, CUmodule module,
    217                               const char *symbol_name, CUdeviceptr *dptr,
    218                               size_t *bytes);
    219 
    220   // Unloads module from the current context via cuModuleUnload.
    221   // TODO(leary) the documentation doesn't say what kind of disasters happen
    222   // if you try to unload a module while its CUfunctions are in use.
    223   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
    224   static void UnloadModule(CudaContext* context, CUmodule module);
    225 
    226   // Performs a synchronous memset of the device memory segment via cuMemsetD8.
    227   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
    228   static bool SynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
    229                                      uint8 value, size_t size);
    230 
    231   // Performs a synchronous memset of the device memory segment via cuMemsetD32.
    232   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
    233   static bool SynchronousMemsetUint32(CudaContext* context,
    234                                       CUdeviceptr location, uint32 value,
    235                                       size_t uint32_count);
    236 
    237   // Performs an asynchronous memset of the device memory segment via
    238   // cuMemsetD8Async.
    239   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
    240   static bool AsynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
    241                                       uint8 value, size_t uint32_count,
    242                                       CUstream stream);
    243 
    244   // Performs an asynchronous memset of the device memory segment via
    245   // cuMemsetD32Async.
    246   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
    247   static bool AsynchronousMemsetUint32(CudaContext* context,
    248                                        CUdeviceptr location, uint32 value,
    249                                        size_t uint32_count, CUstream stream);
    250 
    251   // -- Synchronous memcopies.
    252   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
    253 
    254   static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
    255                                            CUdeviceptr gpu_src, uint64 size);
    256   static port::Status SynchronousMemcpyH2D(CudaContext* context,
    257                                            CUdeviceptr gpu_dst,
    258                                            const void* host_src, uint64 size);
    259   static port::Status SynchronousMemcpyD2D(CudaContext* context,
    260                                            CUdeviceptr gpu_dst,
    261                                            CUdeviceptr gpu_src, uint64 size);
    262 
    263   // -- Asynchronous memcopies.
    264   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
    265 
    266   static bool AsynchronousMemcpyD2H(CudaContext* context, void *host_dst,
    267                                     CUdeviceptr gpu_src, uint64 size,
    268                                     CUstream stream);
    269   static bool AsynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
    270                                     const void *host_src, uint64 size,
    271                                     CUstream stream);
    272   static bool AsynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
    273                                     CUdeviceptr gpu_src, uint64 size,
    274                                     CUstream stream);
    275 
    276   // The CUDA stream callback type signature.
    277   // The data passed to AddStreamCallback is subsequently passed to this
    278   // callback when it fires.
    279   //
    280   // Some notable things:
    281   // * Callbacks must not make any CUDA API calls.
    282   // * Callbacks from independent streams execute in an undefined order and may
    283   //   be serialized.
    284   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
    285   typedef void (*StreamCallback)(CUstream stream, CUresult status, void *data);
    286 
    287   // Enqueues a callback operation into stream.
    288   // See StreamCallback above and the NVIDIA documentation for additional
    289   // details.
    290   static bool AddStreamCallback(CudaContext* context, CUstream stream,
    291                                 StreamCallback callback, void *data);
    292 
    293   // Causes stream to wait for event to trigger before proceeding via
    294   // cuStreamWaitEvent.
    295   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
    296   static bool WaitStreamOnEvent(CudaContext* context, CUstream stream,
    297                                 CUevent event);
    298 
    299   // Blocks the calling thread until the operations enqueued onto stream have
    300   // been completed, via cuStreamSynchronize.
    301   //
    302   // TODO(leary) if a pathological thread enqueues operations onto the stream
    303   // while another thread blocks like this, can you wind up waiting an unbounded
    304   // amount of time?
    305   //
    306   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
    307   static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
    308 
    309   // Blocks the calling thread until the operations associated with the context
    310   // have been completed, via cuCtxSynchronize.
    311   //
    312   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
    313   static bool SynchronizeContext(CudaContext* context);
    314 
    315   // Returns true if all stream tasks have completed at time of the call. Note
    316   // the potential for races around this call (if another thread adds work to
    317   // the stream immediately after this returns).
    318   static bool IsStreamIdle(CudaContext* context, CUstream stream);
    319 
    320   // Returns whether code in the from context can access memory in the to
    321   // context via cuDeviceCanAccessPeer.
    322   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
    323   static bool CanEnablePeerAccess(CudaContext* from, CudaContext* to);
    324 
    325   // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
    326   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
    327   static port::Status EnablePeerAccess(CudaContext* from, CudaContext* to);
    328 
    329   // Returns the elapsed milliseconds between start and stop via
    330   // cuEventElapsedTime.
    331   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
    332   static bool GetEventElapsedTime(CudaContext* context,
    333                                   float *elapsed_milliseconds, CUevent start,
    334                                   CUevent stop);
    335 
    336   // Records that an event occurred when execution reaches the current point in
    337   // thestream via cuEventRecord.
    338   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
    339   static port::Status RecordEvent(CudaContext* context, CUevent event,
    340                                   CUstream stream);
    341 
    342   // Polls (without blocking) to determine the status of an event - pending or
    343   // complete (or an error status).
    344   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
    345   static port::StatusOr<CUresult> QueryEvent(CudaContext* context,
    346                                              CUevent event);
    347 
    348   // -- Pointer-specific calls.
    349 
    350   // Returns the context in which pointer was allocated or registered.
    351   static port::StatusOr<CudaContext*> GetPointerContext(CUdeviceptr pointer);
    352 
    353   // Returns the device associated with the context from GetPointerContext().
    354   static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);
    355 
    356   // Returns the memory space addressed by pointer.
    357   static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);
    358 
    359   // Returns the base address and size of the device pointer dptr.
    360   static port::Status GetPointerAddressRange(CUdeviceptr dptr,
    361                                              CUdeviceptr *base, size_t *size);
    362 
    363   // -- Device-specific calls.
    364 
    365   // Returns the compute capability for the device; i.e (3, 5).
    366   // This is currently done via the deprecated device API.
    367   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
    368   static port::Status GetComputeCapability(int *cc_major, int *cc_minor,
    369                                            CUdevice device);
    370 
    371   // Returns the number of multiprocessors on the device (note that the device
    372   // may be multi-GPU-per-board).
    373   static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);
    374 
    375   // Returns the limit on number of threads that can be resident in a single
    376   // multiprocessor.
    377   static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);
    378 
    379   // Returns the limit on number of threads which may be resident for a single
    380   // block (cooperative thread array).
    381   static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);
    382 
    383   // Returns the amount of shared memory available on a single GPU core (i.e.
    384   // SM on NVIDIA devices).
    385   static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);
    386 
    387   // Returns the amount of shared memory available for a single block
    388   // (cooperative thread array).
    389   static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);
    390 
    391   // Returns the maximum supported number of registers per block.
    392   static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
    393 
    394   // Returns the number of threads per warp.
    395   static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
    396 
    397   // Queries the grid limits for device with cuDeviceGetAttribute calls.
    398   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
    399   static bool GetGridLimits(int *x, int *y, int *z, CUdevice device);
    400 
    401   // Returns a grab-bag of device properties in a caller-owned device_properties
    402   // structure for device_ordinal via cuDeviceGetProperties.
    403   // This call is deprecated in the NVIDIA driver API.
    404   //
    405   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
    406   static bool GetDeviceProperties(CUdevprop *device_properties,
    407                                   int device_ordinal);
    408 
    409   // Returns whether ECC is enabled for the given CUdevice via
    410   // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
    411   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
    412   static bool IsEccEnabled(CUdevice device, bool *result);
    413 
    414   // Returns the total amount of memory available for allocation by the CUDA
    415   // context, in bytes, via cuDeviceTotalMem.
    416   static bool GetDeviceTotalMemory(CUdevice device, uint64 *result);
    417 
    418   // Returns the free amount of memory and total amount of memory, as reported
    419   // by cuMemGetInfo.
    420   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
    421   static bool GetDeviceMemoryInfo(CudaContext* context, int64* free,
    422                                   int64* total);
    423 
    424   // Returns a PCI bus id string for the device.
    425   // [domain]:[bus]:[device].[function]
    426   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
    427   static string GetPCIBusID(CUdevice device);
    428 
    429   // -- Context- and device-independent calls.
    430 
    431   // Returns the number of visible CUDA device via cuDeviceGetCount.
    432   // This should correspond to the set of device ordinals available.
    433   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
    434   static int GetDeviceCount();
    435 
    436   // Returns the driver version number via cuDriverGetVersion.
    437   // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
    438   // instead, the CUDA toolkit release number that this driver is compatible
    439   // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
    440   // compatible driver).
    441   //
    442   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
    443   static bool GetDriverVersion(int *driver_version);
    444 
    445   // -- Other calls
    446 
    447   // Returns the maximum number of blocks (per multiprocessor) occupied by the
    448   // specified kernel/CUfunction when launched with the specified parameters.
    449   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
    450   static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
    451       CudaContext* context, CUfunction kernel, int threads_per_block,
    452       size_t dynamic_shared_memory_bytes);
    453 
    454   // Returns the current context set in CUDA. This is done by calling the cuda
    455   // driver (e.g., this value is not our cached view of the current context).
    456   static CUcontext CurrentContextOrDie();
    457 
    458   // Seam for injecting an error at CUDA initialization time for testing
    459   // purposes.
    460   static bool driver_inject_init_error_;
    461 };
    462 
// Ensures a context is activated within a scope (RAII-style guard: the
// constructor activates, the destructor verifies/restores).
class ScopedActivateContext {
 public:
  // Activates the context via cuCtxSetCurrent, if it is not the currently
  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
  // mechanism is said by NVIDIA to be relatively slow and deprecated.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
  explicit ScopedActivateContext(CudaContext* context);

  // Checks that the context has remained activated for the duration of the
  // scope.
  ~ScopedActivateContext();

 private:
  // Context that was current before this guard activated its context; null
  // when nothing needs to be restored on destruction.
  CudaContext* to_restore_ = nullptr;
};
    479 
    480 // CudaContext wraps a cuda CUcontext handle, and includes a unique id. The
    481 // unique id is positive, and ids are not repeated within the process.
    482 class CudaContext {
    483  public:
    484   CudaContext(CUcontext context, int64 id) : context_(context), id_(id) { }
    485 
    486   CUcontext context() const { return context_; }
    487   int64 id() const { return id_; }
    488 
    489   // Disallow copying and moving.
    490   CudaContext(CudaContext&&) = delete;
    491   CudaContext(const CudaContext&) = delete;
    492   CudaContext& operator=(CudaContext&&) = delete;
    493   CudaContext& operator=(const CudaContext&) = delete;
    494 
    495  private:
    496   CUcontext const context_;
    497   const int64 id_;
    498 };
    499 
    500 }  // namespace cuda
    501 }  // namespace gputools
    502 }  // namespace perftools
    503 
    504 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
    505